{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:43.649914Z",
     "start_time": "2018-12-14T03:51:43.644307Z"
    }
   },
   "outputs": [],
   "source": [
    "# 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:43.662328Z",
     "start_time": "2018-12-14T03:51:43.655762Z"
    }
   },
   "outputs": [],
   "source": [
    "# Some Exploratory Data Analysis (EDA) with a dataset of ~1M real user passwords\n",
    "# Use some powerful Python-based tools such as Pandas, Scikit-learn\n",
    "# Set up some interesting problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:44.475053Z",
     "start_time": "2018-12-14T03:51:43.668318Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Skipping line 6408: expected 1 fields, saw 6\n",
      "Skipping line 34362: expected 1 fields, saw 6\n",
      "Skipping line 54809: expected 1 fields, saw 6\n",
      "Skipping line 68015: expected 1 fields, saw 6\n",
      "Skipping line 195470: expected 1 fields, saw 6\n",
      "Skipping line 206508: expected 1 fields, saw 6\n",
      "Skipping line 239978: expected 1 fields, saw 6\n",
      "Skipping line 258251: expected 1 fields, saw 6\n",
      "Skipping line 260010: expected 1 fields, saw 6\n",
      "Skipping line 260347: expected 1 fields, saw 6\n",
      "Skipping line 284675: expected 1 fields, saw 6\n",
      "Skipping line 303212: expected 1 fields, saw 6\n",
      "Skipping line 354210: expected 1 fields, saw 6\n",
      "Skipping line 372222: expected 1 fields, saw 6\n",
      "Skipping line 380179: expected 1 fields, saw 6\n",
      "Skipping line 425527: expected 1 fields, saw 6\n",
      "Skipping line 430725: expected 1 fields, saw 6\n",
      "Skipping line 444114: expected 1 fields, saw 6\n",
      "Skipping line 460592: expected 1 fields, saw 6\n",
      "Skipping line 462665: expected 1 fields, saw 6\n",
      "Skipping line 480616: expected 1 fields, saw 6\n",
      "Skipping line 484932: expected 1 fields, saw 6\n",
      "\n",
      "Skipping line 526558: expected 1 fields, saw 6\n",
      "Skipping line 535219: expected 1 fields, saw 6\n",
      "Skipping line 551270: expected 1 fields, saw 6\n",
      "Skipping line 618773: expected 1 fields, saw 6\n",
      "Skipping line 681879: expected 1 fields, saw 6\n",
      "Skipping line 745057: expected 1 fields, saw 6\n",
      "Skipping line 754477: expected 1 fields, saw 6\n",
      "Skipping line 779937: expected 1 fields, saw 6\n",
      "Skipping line 799786: expected 1 fields, saw 6\n",
      "Skipping line 805730: expected 1 fields, saw 6\n",
      "Skipping line 828277: expected 1 fields, saw 6\n",
      "Skipping line 839992: expected 1 fields, saw 6\n",
      "Skipping line 874546: expected 1 fields, saw 6\n",
      "Skipping line 939295: expected 1 fields, saw 6\n",
      "Skipping line 986671: expected 1 fields, saw 6\n",
      "Skipping line 991398: expected 1 fields, saw 6\n",
      "Skipping line 995599: expected 1 fields, saw 6\n",
      "Skipping line 1039089: expected 1 fields, saw 6\n",
      "Skipping line 1041030: expected 1 fields, saw 6\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# pandas is a powerful Python-based data package that can handle large quantities of row/column data\n",
    "# we will use pandas many times during these videos. a 2D group of data in pandas is called a 'DataFrame'\n",
    "\n",
    "# import pandas\n",
    "import pandas as pd\n",
    "\n",
    "# use the read_csv function to read in a local file of ~1M passwords from real users of a website.\n",
    "# here we specify that there is no header in the file (no titles of columns) (header=None)\n",
    "# we also specify that if any row gives us an error, skip over it (error_bad_lines=False)\n",
    "# NOTE: error_bad_lines was deprecated in pandas 1.3 and removed in 2.0; on newer versions use on_bad_lines='skip'\n",
    "data = pd.read_csv('../data/passwords.txt', header=None, error_bad_lines=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:44.616133Z",
     "start_time": "2018-12-14T03:51:44.480589Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1048489, 1)\n",
      "(1048485, 1)\n"
     ]
    }
   ],
   "source": [
    "# shape attribute gives us tuple of (# rows, # cols)\n",
    "\n",
    "# 1,048,489 passwords\n",
    "print data.shape\n",
    "\n",
    "# the dropna method will remove any null values from our dataset. We have to include the inplace in order for the\n",
    "# change to take effect\n",
    "data.dropna(inplace=True)\n",
    "# 1,048,485 passwords remain after dropping the 4 rows with null values\n",
    "print data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:44.642248Z",
     "start_time": "2018-12-14T03:51:44.622032Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>primoz123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sal1387</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text\n",
       "0    7606374520\n",
       "1  piontekendre\n",
       "2      rambo144\n",
       "3     primoz123\n",
       "4       sal1387"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's change the name of our columns to make it make more sense\n",
    "data.columns = ['text']\n",
    "\n",
    "# the head method will return the first n rows (default 5)\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:44.660454Z",
     "start_time": "2018-12-14T03:51:44.646426Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text\n",
       "0    7606374520\n",
       "1  piontekendre\n",
       "2      rambo144"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# change the n parameter\n",
    "data.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:44.686854Z",
     "start_time": "2018-12-14T03:51:44.664879Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1048484</th>\n",
       "      <td>Megalon324</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048485</th>\n",
       "      <td>styler9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048486</th>\n",
       "      <td>f1maxmax</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048487</th>\n",
       "      <td>1QAZ2345</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048488</th>\n",
       "      <td>9898981072</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               text\n",
       "1048484  Megalon324\n",
       "1048485     styler9\n",
       "1048486    f1maxmax\n",
       "1048487    1QAZ2345\n",
       "1048488  9898981072"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the tail method will return the last n rows (default 5)\n",
    "data.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:44.711028Z",
     "start_time": "2018-12-14T03:51:44.691327Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1048479</th>\n",
       "      <td>hjvf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048480</th>\n",
       "      <td>guessmine</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048481</th>\n",
       "      <td>ed130287</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048482</th>\n",
       "      <td>zalakaros</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048483</th>\n",
       "      <td>vista5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048484</th>\n",
       "      <td>Megalon324</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048485</th>\n",
       "      <td>styler9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048486</th>\n",
       "      <td>f1maxmax</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048487</th>\n",
       "      <td>1QAZ2345</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1048488</th>\n",
       "      <td>9898981072</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               text\n",
       "1048479        hjvf\n",
       "1048480   guessmine\n",
       "1048481    ed130287\n",
       "1048482   zalakaros\n",
       "1048483      vista5\n",
       "1048484  Megalon324\n",
       "1048485     styler9\n",
       "1048486    f1maxmax\n",
       "1048487    1QAZ2345\n",
       "1048488  9898981072"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# change the n parameter to see more\n",
    "data.tail(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-16T23:01:14.503922Z",
     "start_time": "2018-12-16T23:01:13.122659Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.series.Series'>\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0            21\n",
       "123          12\n",
       "1            10\n",
       "8             8\n",
       "123456        8\n",
       "1230          7\n",
       "5             7\n",
       "123456789     7\n",
       "2             7\n",
       "12345         6\n",
       "Name: text, dtype: int64"
      ]
     },
     "execution_count": 145,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we will grab a single column from our dataframe. A 1D version of a DataFrame is called a Series\n",
    "text = data['text']\n",
    "\n",
    "# show the type of the variable text\n",
    "print type(text)\n",
    "\n",
    "# the value_counts method counts how often each unique value occurs in a Series, most frequent first,\n",
    "# so slicing the first 10 entries shows the most used passwords\n",
    "# in this case, even the most common password ('0') appears only 21 times\n",
    "text.value_counts()[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:46.105160Z",
     "start_time": "2018-12-14T03:51:46.093597Z"
    }
   },
   "outputs": [],
   "source": [
    "# Let's add some features to our DataFrame. We will eventually use these columns to do some machine learning.\n",
    "\n",
    "# The columns we want to add are:\n",
    "\n",
    "# 1. the length of the password\n",
    "# 2. The number of characters in caps\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:46.483837Z",
     "start_time": "2018-12-14T03:51:46.112466Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>primoz123</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sal1387</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text  length\n",
       "0    7606374520      10\n",
       "1  piontekendre      12\n",
       "2      rambo144       8\n",
       "3     primoz123       9\n",
       "4       sal1387       7"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 1. the length of the password\n",
    "\n",
    "# on the left of the equal sign, note we are defining a new column called 'length'. We want this column to hold the \n",
    "# length of the password. \n",
    "\n",
    "# on the right of the equal sign, we use the apply method of pandas Series/DFs. We will apply a function (len in this case)\n",
    "# to every element in the column 'text'\n",
    "\n",
    "data['length'] = data['text'].apply(len)\n",
    "\n",
    "# see our changes take effect\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-16T23:02:37.194229Z",
     "start_time": "2018-12-16T23:02:35.675935Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "123456       8\n",
       "123456789    7\n",
       "12345        6\n",
       "43162        5\n",
       "7758521      5\n",
       "11111        5\n",
       "5201314      5\n",
       "111111       4\n",
       "123321       4\n",
       "102030       4\n",
       "Name: text, dtype: int64"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# top passwords of length 5 or more\n",
    "data[data.length > 4][\"text\"].value_counts()[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:48.067028Z",
     "start_time": "2018-12-14T03:51:46.486552Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "2\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0          0\n",
       "1          0\n",
       "2          0\n",
       "3          0\n",
       "4          0\n",
       "5          8\n",
       "6          1\n",
       "7          0\n",
       "8          0\n",
       "9          0\n",
       "10         0\n",
       "11         0\n",
       "12         0\n",
       "13         0\n",
       "14         0\n",
       "15         0\n",
       "16         0\n",
       "17         0\n",
       "18         0\n",
       "19         0\n",
       "20         0\n",
       "21         0\n",
       "22         0\n",
       "23         0\n",
       "24         0\n",
       "25         0\n",
       "26         0\n",
       "27         0\n",
       "28         0\n",
       "29         0\n",
       "          ..\n",
       "1048459    0\n",
       "1048460    1\n",
       "1048461    0\n",
       "1048462    1\n",
       "1048463    0\n",
       "1048464    0\n",
       "1048465    2\n",
       "1048466    0\n",
       "1048467    5\n",
       "1048468    0\n",
       "1048469    0\n",
       "1048470    0\n",
       "1048471    0\n",
       "1048472    0\n",
       "1048473    0\n",
       "1048474    0\n",
       "1048475    0\n",
       "1048476    0\n",
       "1048477    0\n",
       "1048478    0\n",
       "1048479    0\n",
       "1048480    0\n",
       "1048481    0\n",
       "1048482    0\n",
       "1048483    0\n",
       "1048484    1\n",
       "1048485    0\n",
       "1048486    0\n",
       "1048487    3\n",
       "1048488    0\n",
       "Name: text, Length: 1048485, dtype: int64"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2. The number of characters in caps\n",
    "\n",
    "# Here we will create our own function and pass that function into apply to count the number of characters\n",
    "# in each password that are uppercase\n",
    "\n",
    "def caps(my_string):\n",
    "    return sum([1 for _ in my_string if _.isupper()])\n",
    "\n",
    "print caps(\"all lower case\")\n",
    "print caps(\"soMe lower caSe\")\n",
    "\n",
    "# apply the caps function to the entire series\n",
    "data['text'].apply(caps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:49.508637Z",
     "start_time": "2018-12-14T03:51:48.072415Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>primoz123</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sal1387</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>EVASLRDG</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Detroit84</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>dlbd090505</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>snoesje12</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>56412197</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text  length  num_caps\n",
       "0    7606374520      10         0\n",
       "1  piontekendre      12         0\n",
       "2      rambo144       8         0\n",
       "3     primoz123       9         0\n",
       "4       sal1387       7         0\n",
       "5      EVASLRDG       8         8\n",
       "6     Detroit84       9         1\n",
       "7    dlbd090505      10         0\n",
       "8     snoesje12       9         0\n",
       "9      56412197       8         0"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# store a new column\n",
    "data['num_caps'] = data['text'].apply(caps)\n",
    "\n",
    "# see our changes take effect\n",
    "data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:50.932454Z",
     "start_time": "2018-12-14T03:51:49.514048Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "      <th>lambda_num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>primoz123</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sal1387</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>EVASLRDG</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Detroit84</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>dlbd090505</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>snoesje12</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>56412197</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text  length  num_caps  lambda_num_caps\n",
       "0    7606374520      10         0                0\n",
       "1  piontekendre      12         0                0\n",
       "2      rambo144       8         0                0\n",
       "3     primoz123       9         0                0\n",
       "4       sal1387       7         0                0\n",
       "5      EVASLRDG       8         8                8\n",
       "6     Detroit84       9         1                1\n",
       "7    dlbd090505      10         0                0\n",
       "8     snoesje12       9         0                0\n",
       "9      56412197       8         0                0"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we could have also used a lambda function if we do not wish to keep the function caps in memory\n",
    "# this will produce the same result as above\n",
    "data['lambda_num_caps'] = data['text'].apply(lambda x: sum([1 for _ in x if _.isupper()]))\n",
    "\n",
    "# see our changes take effect\n",
    "data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.016620Z",
     "start_time": "2018-12-14T03:51:50.937604Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>primoz123</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sal1387</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text  length  num_caps\n",
       "0    7606374520      10         0\n",
       "1  piontekendre      12         0\n",
       "2      rambo144       8         0\n",
       "3     primoz123       9         0\n",
       "4       sal1387       7         0"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# delete a column in pandas\n",
    "\n",
    "# the 'drop' method can drop both rows and columns. In this case, we specify the axis as 1 (meaning columns)\n",
    "# we also have to include inplace in order for the changes to take effect\n",
    "data.drop('lambda_num_caps', axis=1, inplace=True)\n",
    "\n",
    "# see our changes\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.208806Z",
     "start_time": "2018-12-14T03:51:51.021578Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1.048485e+06</td>\n",
       "      <td>1.048485e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>8.390173e+00</td>\n",
       "      <td>2.575392e-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.269470e+01</td>\n",
       "      <td>1.205588e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>7.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>8.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>9.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>8.192000e+03</td>\n",
       "      <td>2.690000e+02</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             length      num_caps\n",
       "count  1.048485e+06  1.048485e+06\n",
       "mean   8.390173e+00  2.575392e-01\n",
       "std    2.269470e+01  1.205588e+00\n",
       "min    1.000000e+00  0.000000e+00\n",
       "25%    7.000000e+00  0.000000e+00\n",
       "50%    8.000000e+00  0.000000e+00\n",
       "75%    9.000000e+00  0.000000e+00\n",
       "max    8.192000e+03  2.690000e+02"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# grab some basic descriptive statistics\n",
    "# (describe() summarizes only the numeric columns by default, so 'text' is not included)\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.220034Z",
     "start_time": "2018-12-14T03:51:51.214204Z"
    }
   },
   "outputs": [],
   "source": [
    "# hmm, looks like the 'max' row is telling us we have some massive passwords... Let's check that out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.268780Z",
     "start_time": "2018-12-14T03:51:51.224301Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11879</th>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15346</th>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16983</th>\n",
       "      <td>*</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43379</th>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>64848</th>\n",
       "      <td>.</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      text  length  num_caps\n",
       "11879    8       1         0\n",
       "15346    7       1         0\n",
       "16983    *       1         0\n",
       "43379    8       1         0\n",
       "64848    .       1         0"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# filtering rows\n",
    "\n",
    "# we can use the square bracket operator to pass in conditions for rows that we would like to see from our dataset\n",
    "# the comparison below builds a boolean Series (one True/False per row); indexing with it keeps only the True rows\n",
    "# in this case, we want to see all rows in which the length of the password is 1\n",
    "\n",
    "data[data['length'] == 1].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.327541Z",
     "start_time": "2018-12-14T03:51:51.274469Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>EVASLRDG</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Detroit84</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>NaJT5UCH</td>\n",
       "      <td>8</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>93</th>\n",
       "      <td>UlMtTX</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>SL123456sl</td>\n",
       "      <td>10</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text  length  num_caps\n",
       "5      EVASLRDG       8         8\n",
       "6     Detroit84       9         1\n",
       "36     NaJT5UCH       8         6\n",
       "93       UlMtTX       6         4\n",
       "104  SL123456sl      10         2"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's see rows that contain at least one uppercase character\n",
    "\n",
    "data[data['num_caps'] > 0].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.367966Z",
     "start_time": "2018-12-14T03:51:51.337366Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100185, 3)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's count these rows\n",
    "\n",
    "data[data.num_caps > 0].shape\n",
    "# 100,185 of the 1,048,485 rows (about 9.6%) contain at least one uppercase character"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.400747Z",
     "start_time": "2018-12-14T03:51:51.373478Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>38830</th>\n",
       "      <td>&gt;&lt;script&gt;alert(1)&lt;/script&gt;\\r123Lenda#\\rhallibu...</td>\n",
       "      <td>8192</td>\n",
       "      <td>242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>387398</th>\n",
       "      <td>\\r251885394\\rmello2\\rmaitre1123\\rfk6Ehruu\\rthi...</td>\n",
       "      <td>8192</td>\n",
       "      <td>176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>451793</th>\n",
       "      <td>39&lt;0Y~c.;A1Bj\\r3ddd4t\\r516ks516\\rag0931266\\rac...</td>\n",
       "      <td>8192</td>\n",
       "      <td>223</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>517600</th>\n",
       "      <td>12345\\rhdjcb100\\r060571\\rkaalimaa\\rrelaxmax\\rd...</td>\n",
       "      <td>8192</td>\n",
       "      <td>184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>580134</th>\n",
       "      <td>or1=1--\\r13817676085\\r594112\\rmactools\\r880148...</td>\n",
       "      <td>8192</td>\n",
       "      <td>216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>752693</th>\n",
       "      <td>pass\\rmbmb266888\\r1988luolin\\r15877487956\\rcri...</td>\n",
       "      <td>8192</td>\n",
       "      <td>180</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>841857</th>\n",
       "      <td>==)!)(=\\raviral\\rrimmir33\\rhutcheson\\rrr801201...</td>\n",
       "      <td>8192</td>\n",
       "      <td>269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1013991</th>\n",
       "      <td>AAj6H\\rweebeth\\rmonitor222\\rem1981\\ralexs123\\r...</td>\n",
       "      <td>8192</td>\n",
       "      <td>269</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      text  length  num_caps\n",
       "38830    ><script>alert(1)</script>\\r123Lenda#\\rhallibu...    8192       242\n",
       "387398   \\r251885394\\rmello2\\rmaitre1123\\rfk6Ehruu\\rthi...    8192       176\n",
       "451793   39<0Y~c.;A1Bj\\r3ddd4t\\r516ks516\\rag0931266\\rac...    8192       223\n",
       "517600   12345\\rhdjcb100\\r060571\\rkaalimaa\\rrelaxmax\\rd...    8192       184\n",
       "580134   or1=1--\\r13817676085\\r594112\\rmactools\\r880148...    8192       216\n",
       "752693   pass\\rmbmb266888\\r1988luolin\\r15877487956\\rcri...    8192       180\n",
       "841857   ==)!)(=\\raviral\\rrimmir33\\rhutcheson\\rrr801201...    8192       269\n",
       "1013991  AAj6H\\rweebeth\\rmonitor222\\rem1981\\ralexs123\\r...    8192       269"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's see our long passwords\n",
    "# NOTE: in the output, every one of these rows has length 8192 and its text holds many passwords separated by\n",
    "# carriage returns, so these look like file-parsing artifacts rather than genuine passwords\n",
    "data[data.length > 100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.419522Z",
     "start_time": "2018-12-14T03:51:51.404904Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Int64Index([38830, 387398, 451793, 517600, 580134, 752693, 841857, 1013991], dtype='int64')"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# index labels of the over-long rows; the same expression is passed to drop() in the next cell\n",
    "data[data.length > 100].index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.583993Z",
     "start_time": "2018-12-14T03:51:51.425268Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(8, 3)\n",
      "(1048477, 3)\n"
     ]
    }
   ],
   "source": [
    "# rows longer than 100 characters are treated as malformed; build the mask once and reuse it\n",
    "too_long = data.length > 100\n",
    "\n",
    "# print(...) with a single argument behaves the same on Python 2 and Python 3\n",
    "print(data[too_long].shape)  # only 8 rows are malformed\n",
    "# to make this easy, let's just drop those problematic rows\n",
    "\n",
    "# we will drop passwords that are too long..\n",
    "data.drop(data[too_long].index, axis=0, inplace=True)\n",
    "\n",
    "# 1,048,485 - 8 == 1,048,477\n",
    "print(data.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.801192Z",
     "start_time": "2018-12-14T03:51:51.588320Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1.048477e+06</td>\n",
       "      <td>1.048477e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>8.327732e+00</td>\n",
       "      <td>2.558635e-01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.012173e+00</td>\n",
       "      <td>1.037190e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>7.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>8.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>9.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>2.900000e+01</td>\n",
       "      <td>2.800000e+01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             length      num_caps\n",
       "count  1.048477e+06  1.048477e+06\n",
       "mean   8.327732e+00  2.558635e-01\n",
       "std    2.012173e+00  1.037190e+00\n",
       "min    1.000000e+00  0.000000e+00\n",
       "25%    7.000000e+00  0.000000e+00\n",
       "50%    8.000000e+00  0.000000e+00\n",
       "75%    9.000000e+00  0.000000e+00\n",
       "max    2.900000e+01  2.800000e+01"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# re-check the summary statistics now that the 8 over-long rows have been dropped\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:51.812221Z",
     "start_time": "2018-12-14T03:51:51.806183Z"
    }
   },
   "outputs": [],
   "source": [
    "# go over plotting\n",
    "# sorting\n",
    "# domain knowledge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:52.196706Z",
     "start_time": "2018-12-14T03:51:51.815417Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x117546950>"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# basics of plotting\n",
    "\n",
    "# pandas has a built in plotting feature to do very simple figures like histograms\n",
    "# describe() reports quartiles of 7, 8 and 9 and a mean of ~8.33, so the histogram should peak around 8\n",
    "# (the histogram itself only shows the rough shape, not the exact mean)\n",
    "# the center looks roughly bell-shaped, but lengths range from 1 to 29, so expect a long right tail\n",
    "# rather than a truly normal distribution\n",
    "data['length'].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:52.560263Z",
     "start_time": "2018-12-14T03:51:52.200631Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x11785b150>"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAD8CAYAAACyyUlaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAEu9JREFUeJzt3HGsXvV93/H3Z7i0Ll2CA9MVstnMVmsVBW0lV4QpVXVVNmLINDOpRSBUnAjFk0LadEFanP5DlQiJTKVpQCmSV1hAYqEszWZrdcIskkdt/4ACaRQHWMYVMcWWgRYTqBO1kZPv/nh+bh/unnttnp/dx8/D+yVd3XO+53fO7/e7B/y5zznnnlQVkiT1+AfTHoAkafYZJpKkboaJJKmbYSJJ6maYSJK6GSaSpG6GiSSpm2EiSepmmEiSuq2b9gD+vpx//vm1efPmifb93ve+xznnnHNqB3SGmNe5Oa/ZM69zm/V5PfXUU39ZVf/oRO3eNmGyefNmnnzyyYn2HQwGLC0tndoBnSHmdW7Oa/bM69xmfV5JXjiZdl7mkiR1M0wkSd0ME0lSN8NEktTNMJEkdTNMJEndDBNJUrcThkmS+5K8kuRbI7V3JdmX5Ln2fUOrJ8ldSZaTfDPJZSP7bG/tn0uyfaT+7iT72z53JcmkfUiSpuNkPpl8Hti6orYTeLSqtgCPtnWAq4Et7WsHcA8MgwG4DXgPcDlw2/FwaG0+NLLf1kn6kCRNzwn/Ar6q/ijJ5hXlbcBSW74fGAAfb/UHqqqAx5Kcm+SC1nZfVR0BSLIP2JpkALyjqh5r9QeAa4Evv9U+qurwW5v6ydt/6HU+sPMPT9fh13TgjvdPpV9JeismvWeyMPKP90vAQlveCLw40u5gq61VPzimPkkfkqQp6X43V1VVkjoVgznVfSTZwfBSGAsLCwwGg4n6X1gPt156bKJ9e0065pN19OjR097HNDiv2TOvc5vXea00aZi8fPzSUruM9UqrHwIuHGm3qdUO8XeXrI7XB62+aUz7Sfr4/1TVLmAXwOLiYk36srW7H9zNnfun807MAzcundbjz/pL6FbjvGbPvM5tXue10qSXufYAx5/I2g7sHqnf1J64ugJ4vV2qegS4KsmGduP9KuCRtu2NJFe0p7huWnGst9KHJGlKTvjrdpIvMPxUcX6SgwyfyroDeDjJzcALwHWt+V7gGmAZ+D7wQYCqOpLkU8ATrd0nj9+MBz7M8Imx9QxvvH+51d9SH5Kk6TmZp7luWGXTlWPaFnDLKse5D7hvTP1J4JIx9Vffah+SpOnwL+AlSd0ME0lSN8NEktTNMJEkdTNMJEndDBNJUjfDRJLUzTCRJHUzTCRJ3QwTSVI3w0SS1M0wkSR1M0wkSd0ME0lSN8NEktTNMJEkdTNMJEndDBNJUjfDRJLUzTCRJHUzTCRJ3QwTSVI3w0SS1M0wkSR1M0wkSd0ME0lSN8NEktTNMJEkdTNMJEndDBNJUjfDRJLUzTCRJHXrCpMk/zHJ00m+leQLSX4iyUVJHk+ynOT3k5zd2v54W19u2zePHOcTrf7tJO8bqW9tteUkO0fqY/uQJE3HxGGSZCPwa8BiVV0CnAVcD3wa+ExV/TTwGnBz2+Vm4LVW/0xrR5KL234/C2wFfjfJWUnOAj4HXA1cDNzQ2rJGH5KkKei9zLUOWJ9kHfCTwGHgF4Evtu33A9e25W1tnbb9yiRp9Yeq6m+q6jvAMnB5+1ququer6gfAQ8C2ts9qfUiSpmDiMKmqQ8BvAX/OMEReB54CvltVx1qzg8DGtrwReLHte6y1P2+0vmKf1ernrdGHJGkK1k26Y5INDD9VXAR8F/jvDC9TnTGS7AB2ACwsLDAYDCY6zsJ6uPXSYydueBpMOuaTdfTo0dPexzQ4r9kzr3Ob13mtNHGYAP8a+E5V/QVAki8B7wXOTbKufXLYBBxq7Q8BFwIH22Wx
dwKvjtSPG91nXP3VNfp4k6raBewCWFxcrKWlpYkmeveDu7lzf8+PanIHblw6rccfDAZM+nM5kzmv2TOvc5vXea3Uc8/kz4Erkvxku49xJfAM8DXgl1qb7cDutrynrdO2f7WqqtWvb097XQRsAf4UeALY0p7cOpvhTfo9bZ/V+pAkTUHPPZPHGd4E/zqwvx1rF/Bx4GNJlhne37i37XIvcF6rfwzY2Y7zNPAwwyD6CnBLVf2wfer4CPAI8CzwcGvLGn1Ikqag69pNVd0G3Lai/DzDJ7FWtv1r4JdXOc7twO1j6nuBvWPqY/uQJE2HfwEvSepmmEiSuhkmkqRuhokkqZthIknqZphIkroZJpKkboaJJKmbYSJJ6maYSJK6GSaSpG6GiSSpm2EiSepmmEiSuhkmkqRuhokkqZthIknqZphIkroZJpKkboaJJKmbYSJJ6maYSJK6GSaSpG6GiSSpm2EiSepmmEiSuhkmkqRuhokkqZthIknqZphIkroZJpKkboaJJKlbV5gkOTfJF5P8nyTPJvlXSd6VZF+S59r3Da1tktyVZDnJN5NcNnKc7a39c0m2j9TfnWR/2+euJGn1sX1Ikqaj95PJZ4GvVNXPAP8CeBbYCTxaVVuAR9s6wNXAlva1A7gHhsEA3Aa8B7gcuG0kHO4BPjSy39ZWX60PSdIUTBwmSd4J/AJwL0BV/aCqvgtsA+5vze4Hrm3L24AHaugx4NwkFwDvA/ZV1ZGqeg3YB2xt295RVY9VVQEPrDjWuD4kSVPQ88nkIuAvgP+a5M+S/F6Sc4CFqjrc2rwELLTljcCLI/sfbLW16gfH1FmjD0nSFKzr3Pcy4Fer6vEkn2XF5aaqqiTVM8ATWauPJDsYXlJjYWGBwWAwUR8L6+HWS49NPMYek475ZB09evS09zENzmv2zOvc5nVeK/WEyUHgYFU93ta/yDBMXk5yQVUdbpeqXmnbDwEXjuy/qdUOAUsr6oNW3zSmPWv08SZVtQvYBbC4uFhLS0vjmp3Q3Q/u5s79PT+qyR24cem0Hn8wGDDpz+VM5rxmz7zObV7ntdLEl7mq6iXgxST/vJWuBJ4B9gDHn8jaDuxuy3uAm9pTXVcAr7dLVY8AVyXZ0G68XwU80ra9keSK9hTXTSuONa4PSdIU9P66/avAg0nOBp4HPsgwoB5OcjPwAnBda7sXuAZYBr7f2lJVR5J8CniitftkVR1pyx8GPg+sB77cvgDuWKUPSdIUdIVJVX0DWByz6coxbQu4ZZXj3AfcN6b+JHDJmPqr4/qQJE2HfwEvSepmmEiSuhkmkqRuhokkqZthIknqZphIkroZJpKkboaJJKmbYSJJ6maYSJK6GSaSpG6GiSSpm2EiSepmmEiSuhkmkqRuhokkqZthIknqZphIkroZJpKkboaJJKmbYSJJ6maYSJK6GSaSpG6GiSSpm2EiSepmmEiSuhkmkqRuhokkqZthIknqZphIkroZJpKkboaJJKlbd5gkOSvJnyX5X239oiSPJ1lO8vtJzm71H2/ry2375pFjfKLVv53kfSP1ra22nGTnSH1sH5Kk6TgVn0w+Cjw7sv5p4DNV9dPAa8DNrX4z8Fqrf6a1I8nFwPXAzwJbgd9tAXUW8DngauBi4IbWdq0+JElT0BUmSTYB7wd+r60H+EXgi63J/cC1bXlbW6dtv7K13wY8VFV/U1XfAZaBy9vXclU9X1U/AB4Ctp2gD0nSFPR+Mvkd4D8BP2rr5wHfrapjbf0gsLEtbwReBGjbX2/t/7a+Yp/V6mv1IUmagnWT7pjk3wKvVNVTSZZO3ZBOnSQ7gB0ACwsLDAaDiY6zsB5uvfTYiRueBpOO+WQdPXr0tPcxDc5r9szr3OZ1XitNHCbAe4F/l+Qa4CeAdwCfBc5Nsq59ctgEHGrtDwEXAgeTrAPeCbw6Uj9udJ9x9VfX6ONNqmoXsAtgcXGxlpaWJpro3Q/u5s79PT+qyR24cem0Hn8wGDDpz+VM5rxm
z7zObV7ntdLEl7mq6hNVtamqNjO8gf7VqroR+BrwS63ZdmB3W97T1mnbv1pV1erXt6e9LgK2AH8KPAFsaU9und362NP2Wa0PSdIUnI6/M/k48LEkywzvb9zb6vcC57X6x4CdAFX1NPAw8AzwFeCWqvph+9TxEeARhk+LPdzartWHJGkKTsm1m6oaAIO2/DzDJ7FWtvlr4JdX2f924PYx9b3A3jH1sX1IkqbDv4CXJHUzTCRJ3QwTSVI3w0SS1M0wkSR1M0wkSd0ME0lSN8NEktTNMJEkdTNMJEndDBNJUjfDRJLUzTCRJHUzTCRJ3QwTSVI3w0SS1M0wkSR1M0wkSd0ME0lSN8NEktTNMJEkdTNMJEndDBNJUjfDRJLUzTCRJHUzTCRJ3QwTSVI3w0SS1M0wkSR1M0wkSd0ME0lSN8NEktRt4jBJcmGSryV5JsnTST7a6u9Ksi/Jc+37hlZPkruSLCf5ZpLLRo61vbV/Lsn2kfq7k+xv+9yVJGv1IUmajp5PJseAW6vqYuAK4JYkFwM7gUeragvwaFsHuBrY0r52APfAMBiA24D3AJcDt42Ewz3Ah0b229rqq/UhSZqCicOkqg5X1dfb8l8BzwIbgW3A/a3Z/cC1bXkb8EANPQacm+QC4H3Avqo6UlWvAfuArW3bO6rqsaoq4IEVxxrXhyRpCk7JPZMkm4GfAx4HFqrqcNv0ErDQljcCL47sdrDV1qofHFNnjT4kSVOwrvcASX4K+APg16vqjXZbA4CqqiTV28da1uojyQ6Gl9RYWFhgMBhM1MfCerj10mMTj7HHpGM+WUePHj3tfUyD85o98zq3eZ3XSl1hkuTHGAbJg1X1pVZ+OckFVXW4Xap6pdUPAReO7L6p1Q4BSyvqg1bfNKb9Wn28SVXtAnYBLC4u1tLS0rhmJ3T3g7u5c3937k7kwI1Lp/X4g8GASX8uZzLnNXvmdW7zOq+Vep7mCnAv8GxV/fbIpj3A8SeytgO7R+o3tae6rgBeb5eqHgGuSrKh3Xi/CnikbXsjyRWtr5tWHGtcH5KkKej5dfu9wK8A+5N8o9V+A7gDeDjJzcALwHVt217gGmAZ+D7wQYCqOpLkU8ATrd0nq+pIW/4w8HlgPfDl9sUafUiSpmDiMKmqPwGyyuYrx7Qv4JZVjnUfcN+Y+pPAJWPqr47rQ5I0Hf4FvCSpm2EiSepmmEiSuhkmkqRuhokkqZthIknqZphIkroZJpKkboaJJKmbYSJJ6maYSJK6GSaSpG6GiSSpm2EiSepmmEiSuhkmkqRuhokkqZthIknqZphIkroZJpKkboaJJKmbYSJJ6maYSJK6GSaSpG6GiSSpm2EiSepmmEiSuhkmkqRuhokkqZthIknqtm7aA9DaNu/8w9N6/FsvPcYHxvRx4I73n9Z+Jc0XP5lIkrrNbJgk2Zrk20mWk+yc9ngk6e1sJsMkyVnA54CrgYuBG5JcPN1RSdLb16zeM7kcWK6q5wGSPARsA56Z6qjmyOm+V7MW79dIs2dWw2Qj8OLI+kHgPVMai06xUxFkqz1YsBZDTJrcrIbJSUmyA9jRVo8m+faEhzof+MtTM6ozy6/N6dwmmVc+fZoGc2rN5flq5nVusz6vf3IyjWY1TA4BF46sb2q1N6mqXcCu3s6SPFlVi73HORPN69yc1+yZ17nN67xWmskb8MATwJYkFyU5G7ge2DPlMUnS29ZMfjKpqmNJPgI8ApwF3FdVT095WJL0tjWTYQJQVXuBvX9P3XVfKjuDzevcnNfsmde5zeu83iRVNe0xSJJm3KzeM5EknUEMkxOY19e2JDmQZH+SbyR5ctrj6ZHkviSvJPnWSO1dSfYlea593zDNMU5ilXn9ZpJD7bx9I8k10xzjJJJcmORrSZ5J8nSSj7b6TJ+zNeY18+fsZHiZaw3ttS3/F/g3DP8w8gnghqqa+b+0T3IAWKyqWX7+HYAkvwAcBR6oqkta7T8DR6rqjvZLwIaq+vg0x/lWrTKv
3wSOVtVvTXNsPZJcAFxQVV9P8g+Bp4BrgQ8ww+dsjXldx4yfs5PhJ5O1/e1rW6rqB8Dx17boDFJVfwQcWVHeBtzflu9n+D/1TFllXjOvqg5X1dfb8l8BzzJ8q8VMn7M15vW2YJisbdxrW+blP44C/neSp9qbAubNQlUdbssvAQvTHMwp9pEk32yXwWbqUtBKSTYDPwc8zhydsxXzgjk6Z6sxTN6+fr6qLmP45uVb2iWVuVTDa7nzcj33HuCfAf8SOAzcOd3hTC7JTwF/APx6Vb0xum2Wz9mYec3NOVuLYbK2k3ptyyyqqkPt+yvA/2B4SW+evNyuYR+/lv3KlMdzSlTVy1X1w6r6EfBfmNHzluTHGP6D+2BVfamVZ/6cjZvXvJyzEzFM1jaXr21Jck67QUiSc4CrgG+tvdfM2QNsb8vbgd1THMspc/wf2+bfM4PnLUmAe4Fnq+q3RzbN9DlbbV7zcM5Ohk9znUB7jO93+LvXttw+5SF1S/JPGX4ageFbEP7bLM8ryReAJYZvZ30ZuA34n8DDwD8GXgCuq6qZupm9yryWGF4uKeAA8B9G7jPMhCQ/D/wxsB/4USv/BsP7CzN7ztaY1w3M+Dk7GYaJJKmbl7kkSd0ME0lSN8NEktTNMJEkdTNMJEndDBNJUjfDRJLUzTCRJHX7fykp35cYKXgmAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "data['num_caps'].hist()  # most passwords have no uppercase characters (the 75th percentile of num_caps is 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:52.785108Z",
     "start_time": "2018-12-14T03:51:52.567217Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>213231</th>\n",
       "      <td>#</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>740845</th>\n",
       "      <td>m</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>878195</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>919079</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206917</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       text  length  num_caps\n",
       "213231    #       1         0\n",
       "740845    m       1         0\n",
       "878195    0       1         0\n",
       "919079    0       1         0\n",
       "206917    6       1         0"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Basic Sorting\n",
    "\n",
    "# sort_values returns a new, sorted DataFrame; 'data' itself keeps its original row order\n",
    "data.sort_values('length').head()  # default in ascending order"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:52.988379Z",
     "start_time": "2018-12-14T03:51:52.791644Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>250716</th>\n",
       "      <td>IhanVitunPitkaSalasanaSaakeli</td>\n",
       "      <td>29</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>765290</th>\n",
       "      <td>dj-explosion_isan@hotmail.com</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>608918</th>\n",
       "      <td>http://ro.netlog.com/ioanamya</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>217054</th>\n",
       "      <td>835B2D5E447C64B9A5782DD76353A</td>\n",
       "      <td>29</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>851463</th>\n",
       "      <td>0bTW3MNmJ0AWCOkaYmiA7DIVGt8DX</td>\n",
       "      <td>29</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 text  length  num_caps\n",
       "250716  IhanVitunPitkaSalasanaSaakeli      29         5\n",
       "765290  dj-explosion_isan@hotmail.com      29         0\n",
       "608918  http://ro.netlog.com/ioanamya      29         0\n",
       "217054  835B2D5E447C64B9A5782DD76353A      29         9\n",
       "851463  0bTW3MNmJ0AWCOkaYmiA7DIVGt8DX      29        17"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.sort_values('length', ascending=False).head()  # ascending=False puts the longest passwords first"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:53.011511Z",
     "start_time": "2018-12-14T03:51:52.993277Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>primoz123</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sal1387</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text  length  num_caps\n",
       "0    7606374520      10         0\n",
       "1  piontekendre      12         0\n",
       "2      rambo144       8         0\n",
       "3     primoz123       9         0\n",
       "4       sal1387       7         0"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()  # the DataFrame is unchanged without using the inplace parameter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:53.200666Z",
     "start_time": "2018-12-14T03:51:53.017665Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>851463</th>\n",
       "      <td>0bTW3MNmJ0AWCOkaYmiA7DIVGt8DX</td>\n",
       "      <td>29</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>661093</th>\n",
       "      <td>EAANt8NPs4oz7rDY_bz3pQHLg--~A</td>\n",
       "      <td>29</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>217054</th>\n",
       "      <td>835B2D5E447C64B9A5782DD76353A</td>\n",
       "      <td>29</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>250716</th>\n",
       "      <td>IhanVitunPitkaSalasanaSaakeli</td>\n",
       "      <td>29</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8660</th>\n",
       "      <td>sirmicandrijana96@hotmail.com</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 text  length  num_caps\n",
       "851463  0bTW3MNmJ0AWCOkaYmiA7DIVGt8DX      29        17\n",
       "661093  EAANt8NPs4oz7rDY_bz3pQHLg--~A      29        12\n",
       "217054  835B2D5E447C64B9A5782DD76353A      29         9\n",
       "250716  IhanVitunPitkaSalasanaSaakeli      29         5\n",
       "8660    sirmicandrijana96@hotmail.com      29         0"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort by two columns by passing in list as value\n",
    "data.sort_values(['length', 'num_caps'], ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:53.393220Z",
     "start_time": "2018-12-14T03:51:53.205164Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8660</th>\n",
       "      <td>sirmicandrijana96@hotmail.com</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116820</th>\n",
       "      <td>waaaaaaaaaaaaaaaaaaaaaaaaaaaa</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>173755</th>\n",
       "      <td>mighty morphing power rangers</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>287465</th>\n",
       "      <td>namfon_love_i.p.v@hotmail.com</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>608918</th>\n",
       "      <td>http://ro.netlog.com/ioanamya</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 text  length  num_caps\n",
       "8660    sirmicandrijana96@hotmail.com      29         0\n",
       "116820  waaaaaaaaaaaaaaaaaaaaaaaaaaaa      29         0\n",
       "173755  mighty morphing power rangers      29         0\n",
       "287465  namfon_love_i.p.v@hotmail.com      29         0\n",
       "608918  http://ro.netlog.com/ioanamya      29         0"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# each column can have different sorting ascending option\n",
    "data.sort_values(['length', 'num_caps'], ascending=[False, True]).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:53.411234Z",
     "start_time": "2018-12-14T03:51:53.398158Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "False\n",
      "True\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# Let's add some more features to the data by using some basic domain knowledge\n",
    "# Let's make a column called 'common_phrases' that contains either True or False. \n",
    "# The cell will contain True if the passwords contains some common phrase like \"pass\", \"guest\", \"123\", etc\n",
    "# and False otherwise\n",
    "\n",
    "common = ['pass', 'guest', '123', '789', 'admin']\n",
    "\n",
    "def has_common_phrase(my_string):\n",
    "    return any([_ in my_string for _ in common])\n",
    "\n",
    "print has_common_phrase(\"admin123\")\n",
    "print has_common_phrase(\"sfklvjhdnlhsef234235234\")\n",
    "print has_common_phrase(\"guest23\")\n",
    "print has_common_phrase(\"gues23\")\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:54.366939Z",
     "start_time": "2018-12-14T03:51:53.418800Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "      <th>has_common_phrase</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>primoz123</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sal1387</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text  length  num_caps  has_common_phrase\n",
       "0    7606374520      10         0              False\n",
       "1  piontekendre      12         0              False\n",
       "2      rambo144       8         0              False\n",
       "3     primoz123       9         0               True\n",
       "4       sal1387       7         0              False"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the apply method to make the new columnn\n",
    "data['has_common_phrase'] = data['text'].apply(has_common_phrase)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:54.391489Z",
     "start_time": "2018-12-14T03:51:54.372108Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False    1003782\n",
      "True       44695\n",
      "Name: has_common_phrase, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# use value counts to count the number of times a password used a common phrase\n",
    "print data['has_common_phrase'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:54.418882Z",
     "start_time": "2018-12-14T03:51:54.396873Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False    0.957372\n",
      "True     0.042628\n",
      "Name: has_common_phrase, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# we can also pass in the normalize optional parameter to show us the percentage of passwords that had a common phrase\n",
    "print data['has_common_phrase'].value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:54.428327Z",
     "start_time": "2018-12-14T03:51:54.424004Z"
    }
   },
   "outputs": [],
   "source": [
    "# about 4% of passwords had some commonly used phrase, as defined by us. In later videos, we will get much more\n",
    "# sophisticated about this and use Machine Learning to find common phrases automatically"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:54.600678Z",
     "start_time": "2018-12-14T03:51:54.433159Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x1178a1b10>"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW8AAADuCAYAAAD2p4bdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAHFdJREFUeJzt3Xu8VXWd//HXZ3P1AoqKgiAsBLwhIqICamVKabMmJ6uxTGseNmOj+Rurmczl1K/4pdWqmd9UNmU2TZY2lZP3XM1o4SUvOV4R4SCKurgoiiKCglzOOd/5Y23gQEc565y99nevvd/Px2M/Dqz2PudtD3y7+K7vxZxziIhIuVR8BxARkfxU3iIiJaTyFhEpIZW3iEgJqbxFREpI5S0iUkIqbxGRElJ5i4iUkMpbRKSEVN4iIiWk8hYRKSGVt4hICam8RURKSOUtIlJCKm8RkRJSeYuIlJDKW0SkhFTeIiIlpPIWESkhlbeISAmpvEVESkjlLSJSQipvEZESUnmLiJSQyltEpIRU3iIiJaTyFhEpIZW3iEgJqbxFREpI5S0iUkL9fQcQ6asgSgYC+wB7A4OAAWR/trt+7e7alj//G4A3d/i6Hni9+lqbxuH6Ov3jiPSIOed8ZxDZThAlBhxQfQ2vvvbZ4dX12pA6xGqnWuTAS8DzO7yWb/l1Gofr6pBHWpzKW7wIoqQCjAEmAhO6vCYC44DB/tL12Rq2L/bngDZgPrA4jcN2j9mkSai8pVBBlAwCJgNTgcPYvqAHeozmyyZgEVmRL6i+5gPPpnHY6TOYlIvKW2qmOvY8FTgamAYcRVbYA3zmKok3gSfJivxx4D7gkTQON3tNJQ1L5S29FkRJAMzo8jqS7IGh1MZ64EHgHuBe4I9pHL7uN5I0CpW39FgQJcOAWcApwHvJHihK/XSQ3ZXfS1bo96Rx+JLfSOKLylveUhAl/YDpZGV9CtlwSD+voWRHi4E7gAT4vaY0tg6Vt2wniJIxbCvrk4E9/SaSHDYAdwG3Akkah6nXNFIolbcQRMkhwFnAh4FDPMeR2nkcuAG4IY3D+b7DSG2pvFtUECWjgDOBj5HNEJHm9hRZkV+XxuEjvsNI36m8W0j1geNfkhX2O9DeNq1qPvDvwDVpHK7yHUZ6R+Xd5IIo2QU4jaywT6U1F8ZI9zYBN5MV+e+0SKhcVN5NKoiSicBngE9Qn70/pNyWAj8FrtKDznJQeTeZIEreA3wWeB9gnuNI+ThgDtnd+I1pHG70nEfegsq7CVSHRj4OXAhM8hxHmsdK4DvA99M4XOs7jGxP5V1iQZSMBi4AziXby1qkCGuAHwDfSeNwpe8wklF5l1AQJdOAi4APoQM1pH7eJBtO+ac0Dpf6DtPqVN4lEkTJEcClZLNHRHzZDPwC+GYahwt9h2lVKu8SCKLkUGA22RxtPYSURuGAm4DL0jh81HeYVqPybmDVMe1Lyab7aUGNNCoHXANcksbhC77DtAqVdwMKomQP4BKyedplPg5MWss64BvA/0/jcIPvMM1O5d1AgigZAHwa+BLZwboiZZQCF6VxeJ3vIM1M5d0ggig5Fvgx2XmPIs3gbuCzaRzO9R2kGam8PQuiZDfgMrIFNhrXlmbTSTa98EuaI15bKm+Pgig5BfghEHiOIlK0NcAX0jj8ke8gzULl7UEQJXuTLTs+23cWkTq7HfjrNA6X+w5Sdvprep0FUXIWsBAVt7Sm9wLzgyg5x3eQstOdd50EUTKS7IHkn/nOItIgbiG7C3/Fd5AyUnnXQRAl7wSuBUb4ziLSYFYAf5XG4e98BykblXfBgii5CPg62kBK5K044F+Af0zjcJPvMGWh8i5IECVDgauAD/rOIlISDwN/oSX2PaPyLkAQJZOB64GJvrOIlMzzwGna6GrnNNukxqqzSR5AxS3SG6OAe4Io0d9Yd0J33jVS3ZfkO2R7k4hI3ziyVZlf9x2k
Uam8a6B6huQNwKm+s4g0mauBc/Ug80+pvPuo+mDyVuAdvrOINKl7gdM1H3x7Ku8+CKJkH+C/gWm+s4g0ueeA96VxuMh3kEbR4weWljnbzL5c/f0YMzu2uGiNLYiS/cm2vFRxixRvHHBnECWaCFDV4ztvM7uCbHvHk5xzh5rZMOB259wxRQZsREGUHAj8nuwPlIjUz3LgXWkcPus7iG95pgpOd85dAGwAcM6tBgYWkqqBBVFyGHAPKm4RH0YDdwRRMsZ3EN/ylPdmM+tHNoUHMxtOdifeMoIoORL4A7C/7ywiLWwsWYGP8h3EpzzlfTlwI7CvmX2N7Alwy8zBDKJkHPBfwN6+s4gI48kKvGU3e8s128TMDgFOBgyY45xbWFSwRlI9POF+4CDfWURkO23AiWkcvuw7SL3leWA5HljunNtoZicCRwBXO+deKzCfd0GU7ArMAWb4ziIi3ZoHvCONw7W+g9RTnmGT64EOM5sAXAkcAPyikFQNIogSA36OilukkR0B/Kz672vLyFPenc65drItTv/VOXcRMLKYWA3jMuB03yFEZKc+AFzsO0Q95Z1tcibwCbLl4AADah+pMQRRcibwj75ziEiPXRZEycm+Q9RLnvI+B5gJfM0595yZjQOuKSaWX0GUHA38xHcOEcmlH/CrIEoO8B2kHrS3yQ6qDyjnov24RcrqIbIHmBt9BylSnr1NJprZdWbWZmbPbnkVGc6Tb6LiFimzY8jWpTS1PMMmVwFXAO3Au8n22f15EaF8CaLkJOAC3zlEpM8+FUTJOb5DFCnPPO9HnHPTzOwJ59zkrtcKTVgn1X2555EtvRWR8nsDmJTG4VLfQYqQ5857o5lVgKfN7P+Y2enA7gXl8uHbqLhFmsnuwA98hyhKnjvvY4CFwJ7ApcBQ4J+ccw8UF68+gigJ2Tb9UUSay0fTOLzWd4ha61F5V3cT/KZz7vPFR6qvIEr2AubT/AuORFrVSuCQNA5X+w5SSz0aNnHOdQAnFJzFl2+g4hZpZvsCs32HqLW8J+mMAn4NrNty3Tl3QzHRihdEyXjgSaC/7ywiUqh2YEoah22+g9RKntIaDKwCTupyzQGlLW/gy6i4RVpBf7JJCaf4DlIrLbvCMoiSg4EFZEtqRaQ1hGkc/tZ3iFro8V1n9dizc4Gg6+ecc5+sfay6+H+ouEVazReApijvPGPe95MdvPsI0LHlunPu+mKiFSeIksnA42QnAolIa5mWxuGjvkP0VZ7x3l2dc82yX+5XUXGLtKq/B872HaKv8qywvNXM/qywJHUSRMk0so3bRaQ1ndEMJ8/vtLzN7HUzWwt8hqzAN1SvbbleNn/nO4CIeDUAuNB3iL5qqdkmQZTsArwEDPGdRUS8eg0Yncbhup2+s0HlmuNsZh8kW2npgHucczcVkqo4p6HiFpFsj6ZPAt/zHaS38hzG8APgPOAJsr1AzjOz7xcVrCBn+Q4gIg3jfN8B+iLPVMEngUNd9QPV7WEXOOcOLTBfzQRRsjewgiY+NFlEcpuQxuEzvkP0Rp7ZJouBMV1+f0D1WlmcgYpbRLYX+g7QW3nKewiw0MzuMrM7gTZgqJndYma3FBOvpko/r1NEaq605Z1n2ORdO3nLHWTj4Vt8wDmXvsX3CoBbnXOH9+iH91EQJeOAZjwsWUT6ZiOwdxlnnfR4tolz7u63+9/NDOfckX2PVIhTfQcQkYY0CJgF3Ow7SF55hk1yM7PAzO4xs0err+O6ec8kM3vQzOaa2Twzm1i9fnaX61dWT/PpreP78FkRaW6lHDqpZXlXqkU718xurF5bCbzHOXcU8BHg8m4+dx7w3epd+9HAcjM7tPr+46vXO+jbNL9mPQVIRPqulNt+1PIggs5uhk0GAP9qZlsK+KBuPvdH4ItmNhq4wTn3tJmdDEwDHjIzgF3I/kOQW3UPA50KLyJvZVQQJYekcfik7yB5FH2KzOfIlqNPIbvL37DjG5xzvzCz/yH7q8tvzexvyXb8+5lz
7pIaZDimBt9DRJrbwWRHIpZGLYdNNnZzbQ9ghXOuE/g43Rx+YGYHAs865y4ne2hwBDAH+LCZ7Vt9z15m1tu75ym9/JyItI6JvgPklWd5/AfN7GkzW2Nma7vZVbCzm4/9APgrM3scOIQuBxd3cQYw38zmAocDVzvn2oAvAbeb2Tzgd/T+hPcjevk5EWkd3Q3pNrQ887wXA+93zi0sNlJtBVGyGBjvO4eINLS70jh8t+8QeeQZNnmpbMVdpYeVIrIzpRs2yXPn/V1gBHATXca3nXM3FBOt74IoGQKU8cAIEakvB+yexuF630F6Ks9sk6HAeuC9Xa45oGHLm+yBqYjIzhgwAZjnO0hP5Vkef06RQQqyp+8AIlIaB1Ki8s4z22S0md1oZiurr+urC2samcpbRHpqV98B8sjzwPIq4BZg/+rrN9VrjUzlLSI9NdB3gDzylPdw59xVzrn26uunwPCCctWKyltEeqpU5Z3ngeUqMzsb+GX192cCq2ofqab0wFL+RH/aNx9mS9IZlYUr97XVHb7zSGNY5YZuLNMGg3nKe8tJy98mm2VyP9DoDzH7so2sNKl2+g+Y58ZPnNcxfuIoXn7x2MqTy2dWFrwxtfLMwNH28vDBbArMdGReC/ol/Mh3hh7LM9tkCXBagVmK8IrvANLYnmf4iBs7h4+4sfMdW6/1p33zJFvy9IxK28oZlbb2wypLdtuHtaP6WWdvt2iQcmj3HSCPHpe3mY0D/g4Iun7OOdfIhf6y7wBSPu30H/C4Gz/x8Y7xE6/seP/W63vy+upplaeWHF9ZsPboyiIbZy/utTtvBmbs5jGu1M6f7HrayPIMm9wE/DvZLJPuNqFqRCpvqZnXGDJsTue0YXM6p3W56tx4e2Hp9MrCF46rtG2cbM8OGmmvjhhA+xizYk+qkpp7yXeAPPKU94bqtq1l0qsDHER6zuwZN2rMMx2jxvyiY9bWq7uwcf2UyuJ0ZqVt1fTKk26iLR8yjDfGVszt5TGsvL3nfQfII8/eJh8j27zldrbf2+TRYqL1XRAlA+l+n3ERL0ay6qWjK4uWHV9ZsG5q5en+B9jK4btkD0hLNU2tSQ1j9prXfIfoqTzl/Q2yAxWeYduwiXPOnVRQtpoIouQ1NGVQGlg/OtoPsaXpzErbypmVts2HVZbsOpzXRvW3zv19Z2sh65m9plTPLvIMm/wlcKBzblNRYQqyEpW3NLAO+vVf4MZNWNAxbsKPO7bNMx7KG2umVZ5Oj6ssWHN0ZRHjbcVeQ1g/1owhHuM2qxd8B8grT3nPJ1uxWLZx5GWUcK9ekbXsvsednVOn3Nk5tctV5wJ7cdn0ysIVx1UWvHmEPTtwf1s1YmD2gFTrGnovzfsBM9ub7MhGyLbL7mDbJIlji77RzVPeewJPmtlDbD/m3chTBQEeABp6aEek58xSN/KAtGPkAdd2bPtjPZiNb06259KZlQWrplee7Di4smzoXrx+QMXcPh7DlsnjeT/gnFsFHAlgZrOBN5xz/9z1PWZmZMPTNZ+hl6e8v1LrH14n9/kOIFK0DQza5SF3yKEPdRyS3f9V7cvql4+pLFo2s7Lg9aMqT/cfay/tvSsbx5kxyF/ahvRYrb6RmU0g28TvMWAq8D4ze9w5t2f1f/8oMMs59zdmth9wBTCG7Fnihc65B3ryc/KssLy7+oOH5vlcA/gj2XJ+8x1EpN5WMmx40jljeNI5Y+u1Cp0dB9uyZ2dW2l6aUWnbOKmS7rofq0f1t85RHqP69nCNv98hwCeccw+b2dv15eXAt5xzD5hZANxKdhD7TuVZYfkp4Ktkq5A6ycrQkW1g3rDSOFwdREkbMMl3FpFG0Eml30I39sCFHWMP/EnH+7Ze3531a4+qPL3k+MqC146uLHIT7IVhQ1k31oyhHuPWwyrgqRp/z2eccz35D8Is4OBsdAWAYWa2i3PuzZ19MM8d9EXA4c65Mu4Xcj8qb5G39Qa7Dv1D55TJf+icst31MfbS89MrC1+Y
WWlbN8WeGTTaXtl3IJuDJnpA+gCz1/RsznTPrevy6y03u1sM7vJro5cPN/OU9zNkZ1iW0X3Aub5DiJTRUrffqKUd+436dceJW68NZPPGw+25xTMrbS9PryzsOLSydMherB3dz9y+/pL22pydv6X3nHOdZrbazCaS9ejpbJuV8nvgArLdWjGzI51zc3vyffOU9yXA/Wb2P2w/2+TCHN/DFz20FKmhTQwY9Kg76OBHOw46+PsdH9h6fW/WvHJMZdGy4yoL1k6rPNVvrL20125sGGfGLh7j7sxNdfgZFwO3kU21fgS2PjC+ALjCzM4h6+M7q9d2Ks8KyweBe4En6LIxlXPuZz0M71UQJcuARj9zU6TpGJ2dB9nyJdMrC1+cWWnbNLny3C77sXpEfzoOMPM+kWAes9dM2fnbGk+eO+8Bzrm/LyxJ8a4F/sF3CJFW46hUFrkx4xZ1jBl3dccpW6/vxptvTK0sTmdWFqw+trLITbDn99iDN8ZWrK7HF95Yx59VU3nuvL9OtgrpN2w/bPJqIclqLIiSo8j+uiIiDWy0vbziGHty+XGVBeunVhYPGG0vDx+UPSAt4nSjqcxe06Mx5kaTp7yf6+ayc8419FTBroIoWUg2/1JESmQA7ZsmWZrOqLS9PKPS1n5oZelu+7B2dD/rHNGHb5sye824moWsszyLdEr7D9nFNcDXfIcQkXw203/gXDfhoLkdEw76Yce2HTmGsfbVoytPLa0+IN1yutE4M3btwbf9aWGB6yDPnfcA4HzgndVLdwFXOuc2FxOt9oIoGUG2UVWZVoiKSA5GZ+eBtmLZjMrCF4+rLNgw2Z4bNMJWjRyQPSDdcrpRBzCW2WtKdQBDV3nK+8fAAGDL7JKPAx3Oub8pKFshgii5DviQ7xwiUl+7smHdEZVn0uMqC1YfZU8/dsKl95dhmvNbynMHeoxzruuUmjvMLPdOXA3gh6i8RVrOegbv9kDnpEkPdE4C+GLqOU9f5TkgtcPMxm/5jZkdyHb7l5XGHLK9yUWkNT2cxuEffIfoqzzlfRFwp5ndZWZ3A3dQwnnTaRw64Iu+c4iIN//iO0At9HjMG8DMBgEHV3+7yDlX2sN9gyi5Fzjedw4RqavngIPSOGz3HaSvenznbWYXALs45+Y55+YBu5rZp4uLVriLfQcQkbr7fDMUN+SbbTLXOXfkDtcec85NfavPNLogSm4GGv0YNxGpjTlpHM7yHaJW8ox597MuO4abWT9gYO0j1dUllPOhq4jk0w58xneIWspT3v8NXGtmJ5vZycAvq9dKK43DNuBq3zlEpHBXpHG4wHeIWspT3heTzTA5v/qaA3yhiFB19mWyo91EpDm9QnkPUH9LuWabvO03MrveOVfKxS9BlETAN3znEJFCnJfG4ZW+Q9RanjvvnSnN7oLd+BZQ+kn7IvIn5gL/5jtEEWpZ3rU+wLNu0jjsJNurZY3vLCJSM5uAc6v/fjedWpZ3qaVxuJQenh0nIqXw+TQOH/Ydoii1LG/fZ9H1WRqH/0E2i0ZEyu26NA6/5ztEkXpV3mY2zMyO2OFys6xYPB9Y6juEiPTaM8Bf+w5RtDwrLO8iW43Yn+wsyJXAfSU/lLhbQZS8E7gTDSuJlM1GYGYah4/5DlK0POW0h3NuLfBB4Grn3HSgaZaadlXdLlLHpYmUz2dbobghX3n3N7ORwBnArQXlaSRfAX7uO4SI9Niv0jj8oe8Q9ZKnvL8K3AYsds49VD2M4eliYvlX3ff7k8DvfWcRkZ2aD3zKd4h6qtkKy2YVRMkQsgU8R+7svSLixWLgHWkcvug7SD3leWA5mOwJ7iRg8JbrzrlPFhOtcQRRMhL4IzDWdxYR2c5y4IQ0Dpf4DlJveYZNrgFGAKcAdwOjgdeLCNVo0jhcAZwKvOo7i4hs9TIwqxWLG/LdeT/mnJtqZvOcc0eY2QDgHufcjGIjNo4gSo4j201x8M7eKyKFeg14dxqHc30H8SXPnffm6tfXzOxwYA9g
39pHalxpHN4PnMm2/y9EpP7WAWErFzfkK+8fmdkw4EvALUAb8M1CUjWwNA5vAk5He4CL+LAR+ED1Rqql5Rk2GQR8CAiAAdXLzjn31WKiNbYgSk4EfgPs7jmKSKvYAHwkjcNbfAdpBHnuvG8G/oLsLLg3qq91RYQqgzQO7yJbYbracxSRVrAaeI+Ke5s8d97znXOHF5yndIIoOQz4L2CM7ywiTWoZcGr1zFmpynPnfb+ZTS4sSUlV/0DNBB73nUWkCc0n22hKxb2Dnd55m9kTZKfk9AcmAs+SPTQwsjHvHbeGbUlBlAwFrqdJN+sS8SABzkzjsCXWk+TVk/J+21WFzrmWnCDfnSBKBgDfAT7tO4tIyX2b7CScpjzCrBa0t0kBgig5g+zQ06G+s4iUzGbggjQOm/LQ4FpSeRckiJIJwK/RhlYiPfUscFYahw/4DlIGOimmIGkcLiZ7kHml7ywiJXA1cKSKu+d0510HQZR8FPgRMMR3FpEGswY4L43DX/kOUjYq7zoJouQg4D+BKb6ziDSIe4CPt+qugH2lYZM6SePwKWAGcDmgJ+jSytqB/0u2K6CKu5d05+1BECXHkA2j6GGmtBo9lKwR3Xl7kMbhQ8DRwD/QwvvDSEvZBHwLmKLirg3deXsWRMkY4HvAab6ziBTkNuDC6tCh1IjKu0EEUXI62Xj4aN9ZRGrkOeBzaRze7DtIM9KwSYNI4/BG4DDgu0CH5zgiffEm8BXgMBV3cXTn3YCCKJkMXEq2f7pImdxIdretWSQFU3k3sOqslEuBU3xnEdmJucDFaRze7jtIq1B5l0AQJScAlwHv8p1FZAcPApemcXir7yCtRuVdIkGUzCIr8em+s0jLu4+stG/zHaRVqbxLKIiSPycbTtEiH6m3O8lK+07fQVqdyrukgigx4P3A54AT/aaRFnAbWWnf5zuIZFTeTSCIkinAZ4CPAYM8x5HmsRm4CfjnNA4f9B1GtqfybiJBlOwL/C3wKbTYR3rvebK9d/4tjcMVvsNI91TeTSiIkn7AnwPnA+8lOyxa5O10AnOAK4Bb0jjUQrEGp/JuckGUHEh2N34WMMpzHGk8S4CrgJ9qYU25qLxbRPUB5wnAR4APA/v5TSQerQN+A/wEmKMT2stJ5d2CqsMq7yIr8g8Be/tNJHWwGrgVuAG4LY3DNz3nkT5Sebe4IEr6AyeTFfnpwJ5+E0kNvUg2W+QG4M40Dts955EaUnnLVkGUDCTbR+X9wCxgnN9E0gvPkm0OdQPwgIZEmpfKW95S9WHnrOrrJDS80ojWkS1Vvxv4bRqHcz3nkTpReUuPVB94TiUr8veQPfwc7DVUa1oL3EtW1ncDj2g4pDWpvKVXgigZDBxPdkd+DDAN2MtrqOb0GnAP28r6Mc3BFlB5Sw0FURIAR5EV+Zavw31mKplXgSeAedXXw8A8jVtLd1TeUqggSkazrcy3vPb3Gsq/zcAitpX0E2QlvdxrKikVlbfUXRAlQ4AJwMQuXycCATCS5jhbtRNYAaRkB/GmwFNkZb0wjcNN3pJJU1B5S0OpTlc8ABjb5TUSGEY2pt71NcRDxHbgdbIHhyuAF8g2cnq+y6+XAktV0FIklbeUVnWBUXelvifQn+wOvrtXv26ubWZbKe/4deuvtTJRGoXKW0SkhJphbFFEpOWovEVESkjlLSJSQipvEZESUnmLiJSQyltEpIRU3iIiJaTyFhEpIZW3iEgJqbxFREpI5S0iUkIqbxGRElJ5i4iUkMpbRKSEVN4iIiWk8hYRKSGVt4hICam8RURKSOUtIlJCKm8RkRJSeYuIlJDKW0SkhFTeIiIlpPIWESkhlbeISAmpvEVESkjlLSJSQipvEZESUnmLiJSQyltEpIRU3iIiJfS/FVmG2GoTOsYAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# pandas can also create pie charts. Let's plot the value counts of has_common_phrase\n",
    "data['has_common_phrase'].value_counts().plot.pie()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:54.643366Z",
     "start_time": "2018-12-14T03:51:54.605644Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "      <th>has_common_phrase</th>\n",
       "      <th>percent_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>primoz123</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sal1387</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text  length  num_caps  has_common_phrase  percent_caps\n",
       "0    7606374520      10         0              False           0.0\n",
       "1  piontekendre      12         0              False           0.0\n",
       "2      rambo144       8         0              False           0.0\n",
       "3     primoz123       9         0               True           0.0\n",
       "4       sal1387       7         0              False           0.0"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Let's add a new column called percent_caps which the percent of caps of the total password length.\n",
    "# We can achieve this by dividing two of our columns (numn_caps / length)\n",
    "\n",
    "# in pandas we can accomplish this by doing\n",
    "data['percent_caps'] = data['num_caps'] / data['length']\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:54.686545Z",
     "start_time": "2018-12-14T03:51:54.647631Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "      <th>has_common_phrase</th>\n",
       "      <th>percent_caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>EVASLRDG</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "      <td>False</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>151</th>\n",
       "      <td>ANKEN</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>223</th>\n",
       "      <td>_CECILE</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>False</td>\n",
       "      <td>0.857143</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>259</th>\n",
       "      <td>PITBIKE</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "      <td>False</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>285</th>\n",
       "      <td>ALLOHUAKBAR!</td>\n",
       "      <td>12</td>\n",
       "      <td>11</td>\n",
       "      <td>False</td>\n",
       "      <td>0.916667</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             text  length  num_caps  has_common_phrase  percent_caps\n",
       "5        EVASLRDG       8         8              False      1.000000\n",
       "151         ANKEN       5         5              False      1.000000\n",
       "223       _CECILE       7         6              False      0.857143\n",
       "259       PITBIKE       7         7              False      1.000000\n",
       "285  ALLOHUAKBAR!      12        11              False      0.916667"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# passwords with over 80% of caps in them\n",
    "data[data['percent_caps'] > .8].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:56.230409Z",
     "start_time": "2018-12-14T03:51:54.692583Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "      <th>has_common_phrase</th>\n",
       "      <th>percent_caps</th>\n",
       "      <th>special_characters</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7606374520</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piontekendre</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>rambo144</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>primoz123</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sal1387</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           text  length  num_caps  has_common_phrase  percent_caps  \\\n",
       "0    7606374520      10         0              False           0.0   \n",
       "1  piontekendre      12         0              False           0.0   \n",
       "2      rambo144       8         0              False           0.0   \n",
       "3     primoz123       9         0               True           0.0   \n",
       "4       sal1387       7         0              False           0.0   \n",
       "\n",
       "   special_characters  \n",
       "0                   0  \n",
       "1                   0  \n",
       "2                   0  \n",
       "3                   0  \n",
       "4                   0  "
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Another column. Let's count the number of special characters in the password\n",
    "\n",
    "special = \"!@#$%^&*()+=_-\"\n",
    "\n",
    "def count_special_characters(my_string):\n",
    "    return sum([1 for _ in my_string if _ in special])\n",
    "\n",
    "data['special_characters'] = data['text'].apply(count_special_characters)\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:56.302639Z",
     "start_time": "2018-12-14T03:51:56.236032Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "      <th>has_common_phrase</th>\n",
       "      <th>percent_caps</th>\n",
       "      <th>special_characters</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1372</th>\n",
       "      <td>@stewart@@</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1812</th>\n",
       "      <td>hongnam@@@</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2194</th>\n",
       "      <td>effie___</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3794</th>\n",
       "      <td>po&amp;po444==</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3796</th>\n",
       "      <td>zhan!@#$%^</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            text  length  num_caps  has_common_phrase  percent_caps  \\\n",
       "1372  @stewart@@      10         0              False           0.0   \n",
       "1812  hongnam@@@      10         0              False           0.0   \n",
       "2194    effie___       8         0              False           0.0   \n",
       "3794  po&po444==      10         0              False           0.0   \n",
       "3796  zhan!@#$%^      10         0              False           0.0   \n",
       "\n",
       "      special_characters  \n",
       "1372                   3  \n",
       "1812                   3  \n",
       "2194                   3  \n",
       "3794                   3  \n",
       "3796                   6  "
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# passwords with 3+ special characters\n",
    "data[data['special_characters'] > 2].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:56.345932Z",
     "start_time": "2018-12-14T03:51:56.307751Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "      <th>has_common_phrase</th>\n",
       "      <th>percent_caps</th>\n",
       "      <th>special_characters</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>xutao123</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>amg123</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>12345614</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>passme</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>188</th>\n",
       "      <td>ae12345</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         text  length  num_caps  has_common_phrase  percent_caps  \\\n",
       "53   xutao123       8         0               True           0.0   \n",
       "61     amg123       6         0               True           0.0   \n",
       "108  12345614       8         0               True           0.0   \n",
       "117    passme       6         0               True           0.0   \n",
       "188   ae12345       7         0               True           0.0   \n",
       "\n",
       "     special_characters  \n",
       "53                    0  \n",
       "61                    0  \n",
       "108                   0  \n",
       "117                   0  \n",
       "188                   0  "
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Let's use some more complicted filtering to see some not so great passwords.\n",
    "\n",
    "notsogreat = data[(data['length'] <= 8) &  # length of password is 8 characters or less\n",
    "                  (data['has_common_phrase'] == True) &  # has some common phrase\n",
    "                  (data['special_characters'] == 0) &  # has no special characters\n",
    "                  (data['percent_caps'] == 0)]  # percentage of uppercase characters is 0%\n",
    "\n",
    "notsogreat.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:56.372972Z",
     "start_time": "2018-12-14T03:51:56.351577Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15252, 6)"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# over 15K not so great passwords\n",
    "notsogreat.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:56.419793Z",
     "start_time": "2018-12-14T03:51:56.378611Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "      <th>has_common_phrase</th>\n",
       "      <th>percent_caps</th>\n",
       "      <th>special_characters</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>46929</th>\n",
       "      <td>$1$A$OsW5TjmIcgkbKylJ6NkWP</td>\n",
       "      <td>26</td>\n",
       "      <td>10</td>\n",
       "      <td>False</td>\n",
       "      <td>0.384615</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143866</th>\n",
       "      <td>IBM!_DEC@_SUN#_HP$</td>\n",
       "      <td>18</td>\n",
       "      <td>11</td>\n",
       "      <td>False</td>\n",
       "      <td>0.611111</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>246681</th>\n",
       "      <td>!Q!W@E#R$T%Y^U&amp;I*</td>\n",
       "      <td>17</td>\n",
       "      <td>8</td>\n",
       "      <td>False</td>\n",
       "      <td>0.470588</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>273464</th>\n",
       "      <td>Sainte-Marie-La-Blanche</td>\n",
       "      <td>23</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>0.173913</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>288413</th>\n",
       "      <td>Happy Birthday!!!</td>\n",
       "      <td>17</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>0.117647</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              text  length  num_caps  has_common_phrase  \\\n",
       "46929   $1$A$OsW5TjmIcgkbKylJ6NkWP      26        10              False   \n",
       "143866          IBM!_DEC@_SUN#_HP$      18        11              False   \n",
       "246681           !Q!W@E#R$T%Y^U&I*      17         8              False   \n",
       "273464     Sainte-Marie-La-Blanche      23         4              False   \n",
       "288413           Happy Birthday!!!      17         2              False   \n",
       "\n",
       "        percent_caps  special_characters  \n",
       "46929       0.384615                   3  \n",
       "143866      0.611111                   7  \n",
       "246681      0.470588                   9  \n",
       "273464      0.173913                   3  \n",
       "288413      0.117647                   3  "
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "betterpasswords = data[(data['length'] > 16) &  # length of password if 17 characters or more\n",
    "                       (data['has_common_phrase'] == False) &  # has no common phrase\n",
    "                       (data['special_characters'] > 2) &  # has 3 or more special characters\n",
    "                       (data['percent_caps'] > .1)] # percentage of uppercase characters is higher than 10%\n",
    "\n",
    "betterpasswords.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:56.428480Z",
     "start_time": "2018-12-14T03:51:56.422593Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15, 6)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# only 15 of them!\n",
    "betterpasswords.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:56.459199Z",
     "start_time": "2018-12-14T03:51:56.432628Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>length</th>\n",
       "      <th>num_caps</th>\n",
       "      <th>has_common_phrase</th>\n",
       "      <th>percent_caps</th>\n",
       "      <th>special_characters</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>46929</th>\n",
       "      <td>$1$A$OsW5TjmIcgkbKylJ6NkWP</td>\n",
       "      <td>26</td>\n",
       "      <td>10</td>\n",
       "      <td>False</td>\n",
       "      <td>0.384615</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143866</th>\n",
       "      <td>IBM!_DEC@_SUN#_HP$</td>\n",
       "      <td>18</td>\n",
       "      <td>11</td>\n",
       "      <td>False</td>\n",
       "      <td>0.611111</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>246681</th>\n",
       "      <td>!Q!W@E#R$T%Y^U&amp;I*</td>\n",
       "      <td>17</td>\n",
       "      <td>8</td>\n",
       "      <td>False</td>\n",
       "      <td>0.470588</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>273464</th>\n",
       "      <td>Sainte-Marie-La-Blanche</td>\n",
       "      <td>23</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>0.173913</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>288413</th>\n",
       "      <td>Happy Birthday!!!</td>\n",
       "      <td>17</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>0.117647</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>449021</th>\n",
       "      <td>LP+vi8jtiQZBFVJCXkyLRA==</td>\n",
       "      <td>24</td>\n",
       "      <td>13</td>\n",
       "      <td>False</td>\n",
       "      <td>0.541667</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>457244</th>\n",
       "      <td>_x8ROa2b5-swlg92a!</td>\n",
       "      <td>18</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>473011</th>\n",
       "      <td>oO_KnightSpirit_Oo@yahoo.com</td>\n",
       "      <td>28</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>0.142857</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>516938</th>\n",
       "      <td>a%3A2%3A%7Bs%3A5%3A%22</td>\n",
       "      <td>22</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "      <td>0.227273</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>614704</th>\n",
       "      <td>$1$$i63gBKzKSznjzUiEWRZh0.</td>\n",
       "      <td>26</td>\n",
       "      <td>9</td>\n",
       "      <td>False</td>\n",
       "      <td>0.346154</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>661093</th>\n",
       "      <td>EAANt8NPs4oz7rDY_bz3pQHLg--~A</td>\n",
       "      <td>29</td>\n",
       "      <td>12</td>\n",
       "      <td>False</td>\n",
       "      <td>0.413793</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>936350</th>\n",
       "      <td>4UJ2-5NLF-HFFA-9JW3-X2KV</td>\n",
       "      <td>24</td>\n",
       "      <td>14</td>\n",
       "      <td>False</td>\n",
       "      <td>0.583333</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>989914</th>\n",
       "      <td>!Q@W#E$R%T^Y&amp;U*I(O)P_{+}</td>\n",
       "      <td>24</td>\n",
       "      <td>10</td>\n",
       "      <td>False</td>\n",
       "      <td>0.416667</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1024365</th>\n",
       "      <td>What_is_MD5_you_ask</td>\n",
       "      <td>19</td>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "      <td>0.157895</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1043249</th>\n",
       "      <td>xa%3A2%3A%7Bs%3A5%3A%22</td>\n",
       "      <td>23</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "      <td>0.217391</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                  text  length  num_caps  has_common_phrase  \\\n",
       "46929       $1$A$OsW5TjmIcgkbKylJ6NkWP      26        10              False   \n",
       "143866              IBM!_DEC@_SUN#_HP$      18        11              False   \n",
       "246681               !Q!W@E#R$T%Y^U&I*      17         8              False   \n",
       "273464         Sainte-Marie-La-Blanche      23         4              False   \n",
       "288413               Happy Birthday!!!      17         2              False   \n",
       "449021        LP+vi8jtiQZBFVJCXkyLRA==      24        13              False   \n",
       "457244              _x8ROa2b5-swlg92a!      18         2              False   \n",
       "473011    oO_KnightSpirit_Oo@yahoo.com      28         4              False   \n",
       "516938          a%3A2%3A%7Bs%3A5%3A%22      22         5              False   \n",
       "614704      $1$$i63gBKzKSznjzUiEWRZh0.      26         9              False   \n",
       "661093   EAANt8NPs4oz7rDY_bz3pQHLg--~A      29        12              False   \n",
       "936350        4UJ2-5NLF-HFFA-9JW3-X2KV      24        14              False   \n",
       "989914        !Q@W#E$R%T^Y&U*I(O)P_{+}      24        10              False   \n",
       "1024365            What_is_MD5_you_ask      19         3              False   \n",
       "1043249        xa%3A2%3A%7Bs%3A5%3A%22      23         5              False   \n",
       "\n",
       "         percent_caps  special_characters  \n",
       "46929        0.384615                   3  \n",
       "143866       0.611111                   7  \n",
       "246681       0.470588                   9  \n",
       "273464       0.173913                   3  \n",
       "288413       0.117647                   3  \n",
       "449021       0.541667                   3  \n",
       "457244       0.111111                   3  \n",
       "473011       0.142857                   3  \n",
       "516938       0.227273                   6  \n",
       "614704       0.346154                   3  \n",
       "661093       0.413793                   3  \n",
       "936350       0.583333                   4  \n",
       "989914       0.416667                  12  \n",
       "1024365      0.157895                   4  \n",
       "1043249      0.217391                   6  "
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# list out the passwords\n",
    "betterpasswords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:56.470284Z",
     "start_time": "2018-12-14T03:51:56.464089Z"
    }
   },
   "outputs": [],
   "source": [
    "# we will notice that even in our \"better passwords\" we have phrases like happy birthday or the name of an area in France\n",
    "# our later videos will focus on using machine learning to identify these types of phrases and help us filter them out as not so great passwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:04:19.696369Z",
     "start_time": "2018-12-14T04:04:19.688449Z"
    }
   },
   "outputs": [],
   "source": [
    "worst_passwords_2018 = [\n",
    "\"123456\",\n",
    "\"password\",\n",
    "\"123456789\",\n",
    "\"12345678\",\n",
    "\"12345\",\n",
    "\"111111\",\n",
    "\"1234567\",\n",
    "\"sunshine\",\n",
    "\"qwerty\",\n",
    "\"iloveyou\"\n",
    "]\n",
    "# Let's see how many people in this list have one of the worst 10 passwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:04:41.940676Z",
     "start_time": "2018-12-14T04:04:41.878628Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(34,)"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we have some \"winners\"\n",
    "text[text.isin(worst_passwords_2018)].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:51:57.623276Z",
     "start_time": "2018-12-14T03:51:56.486228Z"
    }
   },
   "outputs": [],
   "source": [
    "# we will first a CountVectorizer from sklearn's feature extraction module\n",
    "# the feature extraction module contains many tools built for extracting features from data. Previously\n",
    "# we manually extracted data by applying custom functions that we created such as num_caps, special_characters, etc\n",
    "# We are now entering automatic feature extraction territory\n",
    "\n",
    "# The CountVectorizer module specifically is built to quickly count occurences of phrases within pieces of text\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:52:07.391725Z",
     "start_time": "2018-12-14T03:51:57.630338Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x70 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 6935190 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the steps to use a count vectorizer are usually standard\n",
    "# we will first instantiate an instance of a CountVectorizer with our parameters\n",
    "# There are about 2 dozen parameters to choose from, each with multiple options. For now let's look at two\n",
    "# analyzer=\"char\" will tell the module to count the characters in the string as opposed to whole words seperated by spaces\n",
    "# ngram_range tells the module how many characters to count in a row. In this case, we are only counting\n",
    "\n",
    "# ngram_range: tuple (min_n, max_n)\n",
    "# The lower and upper boundary of the range of n-values for \n",
    "# different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.\n",
    "\n",
    "# character ranges from 1 to 1. Meaning we are counting single characters only\n",
    "one_cv = CountVectorizer(ngram_range=(1, 1), analyzer='char')\n",
    "\n",
    "# once we instantiate the module, we will call upon the fit_transform method to learn the vocabulary and then\n",
    "# transform our text series into a brand new matrix called one_char\n",
    "# Previously we created a matrix of quantitative data by applying our own functions, now we are creating numerical\n",
    "# matrices using sklearn\n",
    "one_char = one_cv.fit_transform(text)\n",
    "\n",
    "one_char\n",
    "\n",
    "# Note it is a sparse matrix\n",
    "\n",
    "# there are 70 unique chars (number of columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:52:07.418703Z",
     "start_time": "2018-12-14T03:52:07.397004Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'\\r': 0,\n",
       " u' ': 1,\n",
       " u'!': 2,\n",
       " u'\"': 3,\n",
       " u'#': 4,\n",
       " u'$': 5,\n",
       " u'%': 6,\n",
       " u'&': 7,\n",
       " u\"'\": 8,\n",
       " u'(': 9,\n",
       " u')': 10,\n",
       " u'*': 11,\n",
       " u'+': 12,\n",
       " u',': 13,\n",
       " u'-': 14,\n",
       " u'.': 15,\n",
       " u'/': 16,\n",
       " u'0': 17,\n",
       " u'1': 18,\n",
       " u'2': 19,\n",
       " u'3': 20,\n",
       " u'4': 21,\n",
       " u'5': 22,\n",
       " u'6': 23,\n",
       " u'7': 24,\n",
       " u'8': 25,\n",
       " u'9': 26,\n",
       " u':': 27,\n",
       " u';': 28,\n",
       " u'<': 29,\n",
       " u'=': 30,\n",
       " u'>': 31,\n",
       " u'?': 32,\n",
       " u'@': 33,\n",
       " u'[': 34,\n",
       " u'\\\\': 35,\n",
       " u']': 36,\n",
       " u'^': 37,\n",
       " u'_': 38,\n",
       " u'`': 39,\n",
       " u'a': 40,\n",
       " u'b': 41,\n",
       " u'c': 42,\n",
       " u'd': 43,\n",
       " u'e': 44,\n",
       " u'f': 45,\n",
       " u'g': 46,\n",
       " u'h': 47,\n",
       " u'i': 48,\n",
       " u'j': 49,\n",
       " u'k': 50,\n",
       " u'l': 51,\n",
       " u'm': 52,\n",
       " u'n': 53,\n",
       " u'o': 54,\n",
       " u'p': 55,\n",
       " u'q': 56,\n",
       " u'r': 57,\n",
       " u's': 58,\n",
       " u't': 59,\n",
       " u'u': 60,\n",
       " u'v': 61,\n",
       " u'w': 62,\n",
       " u'x': 63,\n",
       " u'y': 64,\n",
       " u'z': 65,\n",
       " u'{': 66,\n",
       " u'|': 67,\n",
       " u'}': 68,\n",
       " u'~': 69}"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can peak into the learned vocabulary of the CountVectorizer by calling the vocabulary_ attribute of the CV\n",
    "\n",
    "# the keys are the learned phrases while the values represent a unique index used by the CV to keep track of the vocab\n",
    "one_cv.vocabulary_\n",
    "\n",
    "# Note that is auto lowercases!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:52:16.883294Z",
     "start_time": "2018-12-14T03:52:07.424761Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x96 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 6955519 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# now with lowercase=False, this way we will not force the lowercasing of characters\n",
    "one_cv = CountVectorizer(ngram_range=(1, 1), analyzer='char', lowercase=False)\n",
    "\n",
    "\n",
    "one_char = one_cv.fit_transform(text)\n",
    "\n",
    "one_char\n",
    "\n",
    "# there are now 96 unique chars (number of columns) ( 26 alphabet characters more :) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:52:16.911054Z",
     "start_time": "2018-12-14T03:52:16.889293Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'\\r': 0,\n",
       " u' ': 1,\n",
       " u'!': 2,\n",
       " u'\"': 3,\n",
       " u'#': 4,\n",
       " u'$': 5,\n",
       " u'%': 6,\n",
       " u'&': 7,\n",
       " u\"'\": 8,\n",
       " u'(': 9,\n",
       " u')': 10,\n",
       " u'*': 11,\n",
       " u'+': 12,\n",
       " u',': 13,\n",
       " u'-': 14,\n",
       " u'.': 15,\n",
       " u'/': 16,\n",
       " u'0': 17,\n",
       " u'1': 18,\n",
       " u'2': 19,\n",
       " u'3': 20,\n",
       " u'4': 21,\n",
       " u'5': 22,\n",
       " u'6': 23,\n",
       " u'7': 24,\n",
       " u'8': 25,\n",
       " u'9': 26,\n",
       " u':': 27,\n",
       " u';': 28,\n",
       " u'<': 29,\n",
       " u'=': 30,\n",
       " u'>': 31,\n",
       " u'?': 32,\n",
       " u'@': 33,\n",
       " u'A': 34,\n",
       " u'B': 35,\n",
       " u'C': 36,\n",
       " u'D': 37,\n",
       " u'E': 38,\n",
       " u'F': 39,\n",
       " u'G': 40,\n",
       " u'H': 41,\n",
       " u'I': 42,\n",
       " u'J': 43,\n",
       " u'K': 44,\n",
       " u'L': 45,\n",
       " u'M': 46,\n",
       " u'N': 47,\n",
       " u'O': 48,\n",
       " u'P': 49,\n",
       " u'Q': 50,\n",
       " u'R': 51,\n",
       " u'S': 52,\n",
       " u'T': 53,\n",
       " u'U': 54,\n",
       " u'V': 55,\n",
       " u'W': 56,\n",
       " u'X': 57,\n",
       " u'Y': 58,\n",
       " u'Z': 59,\n",
       " u'[': 60,\n",
       " u'\\\\': 61,\n",
       " u']': 62,\n",
       " u'^': 63,\n",
       " u'_': 64,\n",
       " u'`': 65,\n",
       " u'a': 66,\n",
       " u'b': 67,\n",
       " u'c': 68,\n",
       " u'd': 69,\n",
       " u'e': 70,\n",
       " u'f': 71,\n",
       " u'g': 72,\n",
       " u'h': 73,\n",
       " u'i': 74,\n",
       " u'j': 75,\n",
       " u'k': 76,\n",
       " u'l': 77,\n",
       " u'm': 78,\n",
       " u'n': 79,\n",
       " u'o': 80,\n",
       " u'p': 81,\n",
       " u'q': 82,\n",
       " u'r': 83,\n",
       " u's': 84,\n",
       " u't': 85,\n",
       " u'u': 86,\n",
       " u'v': 87,\n",
       " u'w': 88,\n",
       " u'x': 89,\n",
       " u'y': 90,\n",
       " u'z': 91,\n",
       " u'{': 92,\n",
       " u'|': 93,\n",
       " u'}': 94,\n",
       " u'~': 95}"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_cv.vocabulary_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:52:16.951342Z",
     "start_time": "2018-12-14T03:52:16.916392Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>!</th>\n",
       "      <th>\"</th>\n",
       "      <th>#</th>\n",
       "      <th>$</th>\n",
       "      <th>%</th>\n",
       "      <th>&amp;</th>\n",
       "      <th>'</th>\n",
       "      <th>(</th>\n",
       "      <th>...</th>\n",
       "      <th>u</th>\n",
       "      <th>v</th>\n",
       "      <th>w</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "      <th>{</th>\n",
       "      <th>|</th>\n",
       "      <th>}</th>\n",
       "      <th>~</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 96 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   \\r     !  \"  #  $  %  &  '  ( ...  u  v  w  x  y  z  {  |  }  ~\n",
       "0   0  0  3  0  0  0  0  0  0  0 ...  0  0  1  0  1  0  0  0  0  0\n",
       "\n",
       "[1 rows x 96 columns]"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# vectorize a new password with the already-fitted single-character vectorizer\n",
    "pd.DataFrame(\n",
    "    one_cv.transform(['qwerty123!!!']).toarray(),\n",
    "    columns=one_cv.get_feature_names()\n",
    ")\n",
    "\n",
    "# transform() cannot learn new vocabulary: a character unseen during fit is simply ignored"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:52:16.990094Z",
     "start_time": "2018-12-14T03:52:16.957170Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "True\n",
      "False\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>!</th>\n",
       "      <th>\"</th>\n",
       "      <th>#</th>\n",
       "      <th>$</th>\n",
       "      <th>%</th>\n",
       "      <th>&amp;</th>\n",
       "      <th>'</th>\n",
       "      <th>(</th>\n",
       "      <th>...</th>\n",
       "      <th>u</th>\n",
       "      <th>v</th>\n",
       "      <th>w</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "      <th>{</th>\n",
       "      <th>|</th>\n",
       "      <th>}</th>\n",
       "      <th>~</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 96 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   \\r     !  \"  #  $  %  &  '  ( ...  u  v  w  x  y  z  {  |  }  ~\n",
       "0   0  0  3  0  0  0  0  0  0  0 ...  0  0  1  0  1  0  0  0  0  0\n",
       "\n",
       "[1 rows x 96 columns]"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# membership checks against the fitted vocabulary\n",
    "# (print(...) with a single argument behaves the same on Python 2 and Python 3)\n",
    "print(\"~\" in one_cv.vocabulary_)\n",
    "\n",
    "print(\"D\" in one_cv.vocabulary_)\n",
    "\n",
    "# the tab character is not in the learned vocabulary, so this prints False\n",
    "print(\"\\t\" in one_cv.vocabulary_)\n",
    "\n",
    "\n",
    "# transforming a new password that contains the out-of-vocabulary tab character\n",
    "pd.DataFrame(one_cv.transform(['qw\\terty123!!!']).toarray(), columns=one_cv.get_feature_names())\n",
    "\n",
    "# the vectorizer cannot learn new vocabulary at transform time: the unseen tab is ignored"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:52:17.007374Z",
     "start_time": "2018-12-14T03:52:16.995794Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "         True,  True,  True,  True,  True,  True]])"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# compare the count vectors of the password with and without the tab, element by element:\n",
    "# every entry matches\n",
    "(\n",
    "    one_cv.transform(['qw\\terty123!!!']).toarray()\n",
    "    == one_cv.transform(['qwerty123!!!']).toarray()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:53:23.962940Z",
     "start_time": "2018-12-14T03:52:17.012770Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x2570934 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 31053193 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count every run of 1 to 5 consecutive characters (character n-grams) in `text`\n",
    "five_cv = CountVectorizer(analyzer='char', ngram_range=(1, 5))\n",
    "five_char = five_cv.fit_transform(text)\n",
    "\n",
    "# the column count of the sparse matrix is the number of distinct n-grams: 2,570,934\n",
    "five_char"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:53:24.171802Z",
     "start_time": "2018-12-14T03:53:23.967800Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'uer24': 2269299,\n",
       " u'uer23': 2269298,\n",
       " u'uer21': 2269297,\n",
       " u'uer20': 2269296,\n",
       " u'a4uz5': 640686,\n",
       " u'rotai': 2047903,\n",
       " u'hd20m': 1257873,\n",
       " u'i7n5': 1317982,\n",
       " u'fkhb8': 1146472,\n",
       " u'juy9f': 1460014,\n",
       " u'xodu': 2443742,\n",
       " u'xodt': 2443740,\n",
       " u'xodp': 2443738,\n",
       " u'xodz': 2443750,\n",
       " u'xody': 2443748,\n",
       " u'xodx': 2443746,\n",
       " u'xode': 2443720,\n",
       " u'xodd': 2443718,\n",
       " u'xodc': 2443716,\n",
       " u'xodb': 2443713,\n",
       " u'xoda': 2443711,\n",
       " u'xodo': 2443734,\n",
       " u'xodn': 2443732,\n",
       " u'xodk': 2443730,\n",
       " u'xodj': 2443729,\n",
       " u'xodi': 2443726,\n",
       " u'5faer': 391973,\n",
       " u'yhx35': 2487398,\n",
       " u'yhx36': 2487399,\n",
       " u'yhx38': 2487400,\n",
       " u'xod7': 2443709,\n",
       " u'bs796': 824988,\n",
       " u'xod2': 2443707,\n",
       " u'0nxh9': 91143,\n",
       " u'cbvk7': 858788,\n",
       " u'yps58': 2502597,\n",
       " u'bs791': 824987,\n",
       " u'voq4p': 2352095,\n",
       " u'd7qwe': 927199,\n",
       " u'fe48c': 1132976,\n",
       " u'talb0': 2182556,\n",
       " u'fe48w': 1132977,\n",
       " u'rubsl': 2062096,\n",
       " u'q\\rla': 1950010,\n",
       " u'q\\rlj': 1950012,\n",
       " u'ktgc9': 1520695,\n",
       " u'wf060': 2384164,\n",
       " u'wf061': 2384165,\n",
       " u'80273': 510089,\n",
       " u'talbl': 2182560,\n",
       " u'talbo': 2182561,\n",
       " u'talbi': 2182559,\n",
       " u'talbe': 2182558,\n",
       " u'talba': 2182557,\n",
       " u'kxm3p': 1527343,\n",
       " u'ythpi': 2509063,\n",
       " u'264fh': 200610,\n",
       " u'jpqu': 1451716,\n",
       " u'ythpd': 2509062,\n",
       " u'cluc2': 887663,\n",
       " u'1y6n': 170356,\n",
       " u'1y6j': 170354,\n",
       " u'gi80f': 1202135,\n",
       " u'ity@1': 1398287,\n",
       " u'dtg65': 983217,\n",
       " u'dtg66': 983218,\n",
       " u'282wn': 206761,\n",
       " u'e40ab': 1003955,\n",
       " u'324eb': 246855,\n",
       " u'248mj': 194333,\n",
       " u'cluc3': 887664,\n",
       " u'248mn': 194334,\n",
       " u'ewivu': 1101789,\n",
       " u'1m0ug': 159430,\n",
       " u'radz1': 1998361,\n",
       " u'ewiva': 1101788,\n",
       " u'324et': 246856,\n",
       " u'1y69': 170352,\n",
       " u'1y64': 170347,\n",
       " u'1y65': 170348,\n",
       " u'1y66': 170350,\n",
       " u'1y61': 170345,\n",
       " u'68ayj': 433085,\n",
       " u'68aye': 433084,\n",
       " u'v9zor': 2327832,\n",
       " u'exgq': 1104109,\n",
       " u'zie36': 2543679,\n",
       " u'zwyk4': 2563694,\n",
       " u'ef53_': 1034604,\n",
       " u'bo32r': 815408,\n",
       " u'om19p': 1839496,\n",
       " u'cllsy': 887174,\n",
       " u'om19t': 1839497,\n",
       " u'hd200': 1257870,\n",
       " u'pyije': 1948385,\n",
       " u'om19l': 1839495,\n",
       " u'bo32f': 815407,\n",
       " u'r3q0k': 1989314,\n",
       " u'cllss': 887173,\n",
       " u'17vk2': 137104,\n",
       " u'3l123': 283825,\n",
       " u'om199': 1839494,\n",
       " u'om198': 1839493,\n",
       " u'om193': 1839488,\n",
       " u'om192': 1839487,\n",
       " u'bo321': 815406,\n",
       " u'om190': 1839486,\n",
       " u'om197': 1839492,\n",
       " u'om196': 1839491,\n",
       " u'om195': 1839490,\n",
       " u'om194': 1839489,\n",
       " u'mjn92': 1663095,\n",
       " u'4lpha': 340591,\n",
       " u'uk58r': 2280606,\n",
       " u'w3s0m': 2366913,\n",
       " u'aradr': 728521,\n",
       " u'5262s': 361793,\n",
       " u'vlt8u': 2348980,\n",
       " u'8937z': 538627,\n",
       " u'tjmme': 2210217,\n",
       " u'djoka': 962186,\n",
       " u'85179': 524998,\n",
       " u'aradz': 728525,\n",
       " u'xj205': 2437981,\n",
       " u'89581': 538992,\n",
       " u'lezjc': 1564593,\n",
       " u'lezja': 1564592,\n",
       " u'ltxwd': 1605647,\n",
       " u'ltxwy': 1605648,\n",
       " u'd7cfu': 926837,\n",
       " u'zff80': 2538602,\n",
       " u'djoko': 962188,\n",
       " u'zff88': 2538603,\n",
       " u'3obhj': 287361,\n",
       " u'uyola': 2318652,\n",
       " u'qix4z': 1965824,\n",
       " u'9qam': 610122,\n",
       " u'9qal': 610120,\n",
       " u'9qan': 610124,\n",
       " u'arado': 728520,\n",
       " u'9qae': 610117,\n",
       " u'9qag': 610118,\n",
       " u'9qay': 610129,\n",
       " u'9qaz': 610130,\n",
       " u'widrd': 2388915,\n",
       " u'9qas': 610126,\n",
       " u'sklt0': 2124481,\n",
       " u'9qaw': 610128,\n",
       " u'8aqmx': 542440,\n",
       " u'uyol1': 2318651,\n",
       " u'ssseb': 2146350,\n",
       " u'mj173': 1662151,\n",
       " u'4diya': 333314,\n",
       " u'mj171': 1662150,\n",
       " u'czsga': 914296,\n",
       " u'fh6b2': 1140396,\n",
       " u'9qa3': 610113,\n",
       " u'9qa7': 610115,\n",
       " u'be916': 794581,\n",
       " u't_csv': 2179436,\n",
       " u'be910': 794579,\n",
       " u'be911': 794580,\n",
       " u'a079g': 631665,\n",
       " u'a079b': 631664,\n",
       " u'be919': 794582,\n",
       " u'rby2s': 2006039,\n",
       " u'ndia9': 1726773,\n",
       " u'ndia5': 1726770,\n",
       " u'ndia7': 1726772,\n",
       " u'ndia6': 1726771,\n",
       " u'ndia1': 1726767,\n",
       " u'ndia0': 1726766,\n",
       " u'ndia3': 1726769,\n",
       " u'ndia2': 1726768,\n",
       " u'p8305': 1898602,\n",
       " u'f3dur': 1116480,\n",
       " u'g4e07': 1175341,\n",
       " u'gmplw': 1211363,\n",
       " u'vee67': 2336123,\n",
       " u'eqju': 1075107,\n",
       " u'wdxfw': 2380231,\n",
       " u'ykczh': 2491855,\n",
       " u'xd0io': 2429375,\n",
       " u'ndia@': 1726774,\n",
       " u'rby22': 2006036,\n",
       " u'rby23': 2006037,\n",
       " u'rby20': 2006034,\n",
       " u'rby21': 2006035,\n",
       " u'ndiay': 1726793,\n",
       " u'ndiax': 1726792,\n",
       " u'rby25': 2006038,\n",
       " u'xsy16': 2448728,\n",
       " u'ndiaw': 1726791,\n",
       " u'xsy15': 2448727,\n",
       " u'xsy12': 2448726,\n",
       " u'ndiap': 1726788,\n",
       " u'ndias': 1726790,\n",
       " u'ndiar': 1726789,\n",
       " u'an565': 708872,\n",
       " u'ndial': 1726784,\n",
       " u'an567': 708874,\n",
       " u'an566': 708873,\n",
       " u'an561': 708868,\n",
       " u'an560': 708867,\n",
       " u'an563': 708870,\n",
       " u'ndiaj': 1726783,\n",
       " u'ndiae': 1726779,\n",
       " u'ndiad': 1726778,\n",
       " u'ndiag': 1726781,\n",
       " u'ndiaf': 1726780,\n",
       " u'an569': 708876,\n",
       " u'an568': 708875,\n",
       " u'ndiac': 1726777,\n",
       " u'ndiab': 1726776,\n",
       " u'moo9r': 1675060,\n",
       " u'azpw': 765940,\n",
       " u'gpgkl': 1217917,\n",
       " u'bmcdx': 812726,\n",
       " u'nzta': 1792585,\n",
       " u'ke yo': 1486897,\n",
       " u'7mpel': 499489,\n",
       " u'moo93': 1675058,\n",
       " u'o28ve': 1797308,\n",
       " u'moo99': 1675059,\n",
       " u'azpx': 765943,\n",
       " u'z3431': 2523273,\n",
       " u'azpe': 765916,\n",
       " u'eqjh': 1075100,\n",
       " u'4fsg9': 335869,\n",
       " u'gjeqc': 1205578,\n",
       " u'.stil': 34835,\n",
       " u'.stic': 34834,\n",
       " u'yfsux': 2484073,\n",
       " u's%6': 2073909,\n",
       " u's%5': 2073906,\n",
       " u's%3': 2073903,\n",
       " u's%2': 2073899,\n",
       " u's%9': 2073912,\n",
       " u'qho8q': 1964102,\n",
       " u'cokay': 892910,\n",
       " u'nztt': 1792604,\n",
       " u'jnkb': 1446820,\n",
       " u'dcqfx': 940198,\n",
       " u'cokai': 892907,\n",
       " u'lfjst': 1565961,\n",
       " u'cokan': 892909,\n",
       " u'cokam': 892908,\n",
       " u'dcqfk': 940197,\n",
       " u'cokab': 892905,\n",
       " u'9az54': 597364,\n",
       " u's%w': 2073938,\n",
       " u's%v': 2073935,\n",
       " u's%t': 2073932,\n",
       " u'wpciy': 2400802,\n",
       " u'10jwr': 106120,\n",
       " u's%g': 2073923,\n",
       " u'sroma': 2141935,\n",
       " u's%d': 2073920,\n",
       " u's%c': 2073917,\n",
       " u's%m': 2073929,\n",
       " u's%i': 2073926,\n",
       " u'coka9': 892904,\n",
       " u'coka8': 892903,\n",
       " u'coka1': 892899,\n",
       " u'coka7': 892902,\n",
       " u'coka6': 892901,\n",
       " u'coka5': 892900,\n",
       " u'9az5z': 597365,\n",
       " u's%@': 2073915,\n",
       " u'u1nu2': 2249015,\n",
       " u'qt\\rl': 1974774,\n",
       " u'ygsdc': 2485486,\n",
       " u'24lhj': 195165,\n",
       " u'o8521': 1802646,\n",
       " u'o8520': 1802645,\n",
       " u'o8522': 1802647,\n",
       " u'o8524': 1802648,\n",
       " u'o8527': 1802649,\n",
       " u'ygsdx': 2485487,\n",
       " u'003ye': 39506,\n",
       " u'003yl': 39507,\n",
       " u'003yn': 39508,\n",
       " u'003yu': 39509,\n",
       " u'@8603': 620375,\n",
       " u'ck90': 882349,\n",
       " u'jf012': 1431412,\n",
       " u'0e50': 82333,\n",
       " u'jf011': 1431411,\n",
       " u'jpzav': 1451982,\n",
       " u'0e51': 82335,\n",
       " u' birt': 11253,\n",
       " u'yttri': 2509757,\n",
       " u'ck94': 882366,\n",
       " u'lqqk5': 1599088,\n",
       " u'0e54': 82339,\n",
       " u'o852w': 1802650,\n",
       " u'al717': 694677,\n",
       " u'9cutl': 599631,\n",
       " u'ykuss': 2492610,\n",
       " u'9cute': 599630,\n",
       " u'viqe8': 2345675,\n",
       " u'pulux': 1943369,\n",
       " u'hfx3u': 1266634,\n",
       " u'puluh': 1943366,\n",
       " u'pulum': 1943367,\n",
       " u'pulun': 1943368,\n",
       " u'9ja': 605080,\n",
       " u'9jb': 605131,\n",
       " u'9jc': 605145,\n",
       " u'9jd': 605159,\n",
       " u'9je': 605165,\n",
       " u'9jf': 605201,\n",
       " u'9jg': 605219,\n",
       " u'9jh': 605236,\n",
       " u'9ji': 605256,\n",
       " u'9jj': 605285,\n",
       " u'9jk': 605300,\n",
       " u'9jl': 605321,\n",
       " u'9jm': 605337,\n",
       " u'9jn': 605361,\n",
       " u'9jo': 605380,\n",
       " u'9jp': 605418,\n",
       " u'9jq': 605424,\n",
       " u'9jr': 605435,\n",
       " u'9js': 605451,\n",
       " u'9jt': 605477,\n",
       " u'9ju': 605490,\n",
       " u'9jv': 605531,\n",
       " u'9jw': 605546,\n",
       " u'16pia': 133466,\n",
       " u'9jy': 605569,\n",
       " u'9jz': 605593,\n",
       " u'4h6se': 337054,\n",
       " u'rgizi': 2022569,\n",
       " u'jeylo': 1431298,\n",
       " u'273ls': 203654,\n",
       " u'5df3i': 390112,\n",
       " u'wd0el': 2378862,\n",
       " u'vi<3': 2343021,\n",
       " u'q36p6': 1952945,\n",
       " u'9j.': 604927,\n",
       " u'r\\rfro': 1982039,\n",
       " u'9j0': 604930,\n",
       " u'9j1': 604954,\n",
       " u'9j2': 604963,\n",
       " u'9j3': 604979,\n",
       " u'9j4': 604987,\n",
       " u'9j5': 605001,\n",
       " u'9j6': 605018,\n",
       " u'9j7': 605028,\n",
       " u'9j8': 605044,\n",
       " u'9j9': 605059,\n",
       " u'c3.14': 842479,\n",
       " u'k55*a': 1472654,\n",
       " u' emo': 11620,\n",
       " u'gdou7': 1190137,\n",
       " u'vezal': 2339926,\n",
       " u'mjjyl': 1662938,\n",
       " u'jud.2': 1457978,\n",
       " u'gdou3': 1190136,\n",
       " u'gdou0': 1190134,\n",
       " u'$050': 18241,\n",
       " u'gdou9': 1190138,\n",
       " u'38569': 266532,\n",
       " u'38568': 266531,\n",
       " u'kphuo': 1514584,\n",
       " u'38561': 266524,\n",
       " u'38560': 266523,\n",
       " u'38563': 266526,\n",
       " u'38562': 266525,\n",
       " u'38565': 266528,\n",
       " u'38564': 266527,\n",
       " u'38567': 266530,\n",
       " u'38566': 266529,\n",
       " u'4l91i': 340076,\n",
       " u'3856/': 266522,\n",
       " u'2cwek': 216807,\n",
       " u'gdoug': 1190141,\n",
       " u'\\rjapa': 6267,\n",
       " u'hyybb': 1307140,\n",
       " u'gdoub': 1190139,\n",
       " u'hyybi': 1307141,\n",
       " u'4b8cc': 330266,\n",
       " u'gdout': 1190144,\n",
       " u'gdous': 1190143,\n",
       " u'13892': 121065,\n",
       " u'rovka': 2048437,\n",
       " u'djvul': 962595,\n",
       " u'lfxjr': 1566649,\n",
       " u'cetor': 867566,\n",
       " u'iecsj': 1338340,\n",
       " u'cetot': 867567,\n",
       " u'cetoh': 867562,\n",
       " u'cetok': 867563,\n",
       " u'cetom': 867564,\n",
       " u'ceton': 867565,\n",
       " u'3856a': 266533,\n",
       " u'cetoa': 867557,\n",
       " u'cetob': 867558,\n",
       " u'cetoc': 867559,\n",
       " u'cetoe': 867560,\n",
       " u'cetof': 867561,\n",
       " u'01ud2': 47800,\n",
       " u'0p7o1': 92005,\n",
       " u'pfyx3': 1914443,\n",
       " u'd1u1': 918932,\n",
       " u'1w9b': 169115,\n",
       " u'1w9a': 169112,\n",
       " u'brayn': 822799,\n",
       " u'brayl': 822798,\n",
       " u'g6334': 1177052,\n",
       " u'g6338': 1177053,\n",
       " u'brayd': 822796,\n",
       " u'brayb': 822795,\n",
       " u'braya': 822794,\n",
       " u'brayw': 822802,\n",
       " u'brayt': 822801,\n",
       " u'brays': 822800,\n",
       " u'd1uc': 918934,\n",
       " u'hboqu': 1255690,\n",
       " u'f.hap': 1111090,\n",
       " u'd1ui': 918938,\n",
       " u'd1um': 918940,\n",
       " u'd1us': 918944,\n",
       " u'd1up': 918943,\n",
       " u'd1uu': 918946,\n",
       " u'pfyxg': 1914444,\n",
       " u'd1ux': 918948,\n",
       " u'eejt': 1031618,\n",
       " u'traku': 2225966,\n",
       " u'fjm9r': 1145387,\n",
       " u'bray8': 822793,\n",
       " u'bray6': 822792,\n",
       " u'bray3': 822791,\n",
       " u'bray2': 822790,\n",
       " u'bray1': 822789,\n",
       " u'bray0': 822788,\n",
       " u'o.134': 1793866,\n",
       " u'174us': 135017,\n",
       " u'0x5g': 98464,\n",
       " u'kxwbn': 1527551,\n",
       " u'0x5m': 98466,\n",
       " u'vwxv4': 2360020,\n",
       " u'5sias': 401393,\n",
       " u'kksou': 1503871,\n",
       " u'49a44': 327515,\n",
       " u'275\\ra': 203857,\n",
       " u'04_61': 57728,\n",
       " u'113um': 108951,\n",
       " u'uya15': 2317789,\n",
       " u'0x53': 98459,\n",
       " u'0x51': 98457,\n",
       " u'0x50': 98455,\n",
       " u'0x55': 98461,\n",
       " u'0x58': 98462,\n",
       " u'n5z9l': 1707542,\n",
       " u'49a4e': 327516,\n",
       " u'apbil': 722898,\n",
       " u'apbin': 722899,\n",
       " u'-kiko': 29183,\n",
       " u'!nico': 15302,\n",
       " u'c8sub': 850010,\n",
       " u'vbswy': 2332402,\n",
       " u'qkxeq': 1967402,\n",
       " u'28jmo': 208650,\n",
       " u'1je9q': 156814,\n",
       " u'dhs11': 955327,\n",
       " u'dhs12': 955328,\n",
       " u'apbi2': 722897,\n",
       " u'dmaen': 966606,\n",
       " u'vy3de': 2360843,\n",
       " u'og8st': 1821650,\n",
       " u'88lfl': 536946,\n",
       " u'eejm': 1031605,\n",
       " u'dmaes': 966607,\n",
       " u'wp7du': 2400636,\n",
       " u'248ma': 194332,\n",
       " u'uf0': 2270128,\n",
       " u'3f7g': 278818,\n",
       " u'gcp56': 1188510,\n",
       " u'trakk': 2225960,\n",
       " u'cnkcn': 890538,\n",
       " u'51924': 358686,\n",
       " u'256h3': 197438,\n",
       " u'afm': 673377,\n",
       " u'mfk42': 1653119,\n",
       " u'rsue1': 2056919,\n",
       " u'zrf87': 2556653,\n",
       " u'lraid': 1599672,\n",
       " u'suwoa': 2156427,\n",
       " u'lrain': 1599674,\n",
       " u'lraih': 1599673,\n",
       " u'lrait': 1599677,\n",
       " u'lrais': 1599676,\n",
       " u'lrair': 1599675,\n",
       " u'h1jis': 1238799,\n",
       " u'rsuef': 2056920,\n",
       " u'51928': 358690,\n",
       " u't9k': 2178830,\n",
       " u'rsuep': 2056921,\n",
       " u'arbha': 729046,\n",
       " u's45dj': 2081109,\n",
       " u'een89': 1032360,\n",
       " u'een88': 1032359,\n",
       " u'a.h.': 630720,\n",
       " u'een85': 1032356,\n",
       " u'een84': 1032355,\n",
       " u'hmpak': 1281686,\n",
       " u'een86': 1032357,\n",
       " u'een81': 1032352,\n",
       " u'een83': 1032354,\n",
       " u'een82': 1032353,\n",
       " u'hmpar': 1281687,\n",
       " u'een8*': 1032351,\n",
       " u'5135p': 357428,\n",
       " u'mezis': 1652289,\n",
       " u'macht': 1633425,\n",
       " u'machu': 1633426,\n",
       " u'machv': 1633427,\n",
       " u'machr': 1633423,\n",
       " u'machs': 1633424,\n",
       " u'..abc': 30804,\n",
       " u'mache': 1633416,\n",
       " u'xos35': 2444276,\n",
       " u'macha': 1633415,\n",
       " u'machl': 1633419,\n",
       " u'machm': 1633420,\n",
       " u'machn': 1633421,\n",
       " u'macho': 1633422,\n",
       " u'machh': 1633417,\n",
       " u'machi': 1633418,\n",
       " u'm@we': 1631678,\n",
       " u'a.ha': 630722,\n",
       " u'f u w': 1110779,\n",
       " u'4us9i': 348800,\n",
       " u'een8j': 1032361,\n",
       " u'a.hu': 630724,\n",
       " u'mach4': 1633412,\n",
       " u'mach7': 1633413,\n",
       " u'mach0': 1633409,\n",
       " u'mach1': 1633410,\n",
       " u'mach2': 1633411,\n",
       " u'mach8': 1633414,\n",
       " u'u20m-': 2249242,\n",
       " u'xos3w': 2444277,\n",
       " u'c*=1b': 837420,\n",
       " u'nds63': 1728316,\n",
       " u'nds61': 1728315,\n",
       " u'hnoe': 1283516,\n",
       " u'89a01': 539904,\n",
       " u'h\\redu': 1235881,\n",
       " u'qzhy5': 1981447,\n",
       " u'ebos6': 1020282,\n",
       " u'9n9gy': 608081,\n",
       " u'ebos2': 1020281,\n",
       " u'qzhy9': 1981448,\n",
       " u'ufb': 2270457,\n",
       " u'2rm6d': 230074,\n",
       " u'aff': 672622,\n",
       " u'ufg': 2270938,\n",
       " u'hnok': 1283531,\n",
       " u'raq86': 2001812,\n",
       " u'ebosc': 1020283,\n",
       " u'ebosm': 1020285,\n",
       " u'ebosh': 1020284,\n",
       " u'1dusc': 151667,\n",
       " u'ebost': 1020288,\n",
       " u'ebosp': 1020286,\n",
       " u'eboss': 1020287,\n",
       " u'1dust': 151669,\n",
       " u'9n9g3': 608079,\n",
       " u'ebosz': 1020289,\n",
       " u'cdmam': 863285,\n",
       " u'v1541': 2322084,\n",
       " u'v1542': 2322085,\n",
       " u'7q4r2': 501604,\n",
       " u'v1549': 2322086,\n",
       " u'u$lai': 2246953,\n",
       " u'cirpg': 879152,\n",
       " u'k3p\\rm': 1471086,\n",
       " u'th432': 2200404,\n",
       " u'cirpn': 879153,\n",
       " u'th439': 2200405,\n",
       " u'05g6k': 61551,\n",
       " u'tc001': 2187653,\n",
       " u'rhiss': 2024317,\n",
       " u'eqzxu': 1075661,\n",
       " u'i4g8h': 1315198,\n",
       " u'o7qgk': 1802247,\n",
       " u'hegn2': 1261455,\n",
       " u'sona2': 2135137,\n",
       " u'nlg13': 1758653,\n",
       " u'th43y': 2200406,\n",
       " u'3maag': 285158,\n",
       " u'rhis9': 2024316,\n",
       " u'cup0t': 906865,\n",
       " u'3maal': 285159,\n",
       " u'aokhe': 720338,\n",
       " u'wjxf8': 2392505,\n",
       " u'3maaz': 285160,\n",
       " u'aokhu': 720340,\n",
       " u'ishga': 1390376,\n",
       " u'yq0ty': 2502980,\n",
       " u'ishgg': 1390377,\n",
       " u'ishgi': 1390378,\n",
       " u'ishgo': 1390379,\n",
       " u'ishgu': 1390380,\n",
       " u'kjca3': 1501136,\n",
       " u'fyp92': 1167260,\n",
       " u'in86k': 1368237,\n",
       " u'abeej': 655163,\n",
       " u'hnol': 1283535,\n",
       " u'uf]': 2270318,\n",
       " u'ufcok': 2270515,\n",
       " u'0n4dm': 90258,\n",
       " u'7u723': 504553,\n",
       " u'19gc8': 144484,\n",
       " u'ua9zs': 2256653,\n",
       " u'ishg0': 1390375,\n",
       " u'ufcol': 2270516,\n",
       " u'8\\rhs': 508554,\n",
       " u'8\\rhu': 508557,\n",
       " u'hnor': 1283557,\n",
       " u'8\\rha': 508543,\n",
       " u'8\\rhe': 508548,\n",
       " u'twwtr': 2240681,\n",
       " u'4lph4': 340590,\n",
       " u'8\\rho': 508551,\n",
       " u'had65': 1248793,\n",
       " u'had66': 1248794,\n",
       " u'u32hp': 2250346,\n",
       " u'had63': 1248792,\n",
       " u'dnagy': 968642,\n",
       " u'had69': 1248795,\n",
       " u'jezvp': 1431377,\n",
       " u'58d85': 381925,\n",
       " u'jezvs': 1431378,\n",
       " u'owh33': 1882771,\n",
       " u'dnage': 968641,\n",
       " u'q3tvy': 1953340,\n",
       " u'xneb5': 2442918,\n",
       " u'hnop': 1283551,\n",
       " u'smz36': 2130806,\n",
       " u'mxwdk': 1692581,\n",
       " u'9y5ji': 615182,\n",
       " u'hnow': 1283582,\n",
       " u'sycmc': 2162341,\n",
       " u'tr:27': 2225683,\n",
       " u'i79pe': 1317804,\n",
       " u'yfwbj': 2484186,\n",
       " u'9y5jv': 615183,\n",
       " u'sonam': 2135152,\n",
       " u'nic@1': 1748302,\n",
       " u'ydwar': 2480129,\n",
       " u'774qi': 479090,\n",
       " u's0045': 2074672,\n",
       " u'armbi': 732522,\n",
       " u'ajo96': 688849,\n",
       " u'ajo97': 688850,\n",
       " u'ajo94': 688848,\n",
       " u'gacra': 1182275,\n",
       " u'3176x': 243884,\n",
       " u'abc&z': 654570,\n",
       " u'456^^': 315036,\n",
       " u'r84ph': 1993983,\n",
       " u't67hh': 2175780,\n",
       " u'gacrp': 1182276,\n",
       " u'-life': 29295,\n",
       " u't67ha': 2175779,\n",
       " u'-jump': 29125,\n",
       " u'y9k3n': 2470266,\n",
       " u'0069w': 40196,\n",
       " u'3176\\r': 243873,\n",
       " u'2mc49': 225850,\n",
       " u'ojk4j': 1829517,\n",
       " u'31764': 243878,\n",
       " u'31765': 243879,\n",
       " u'31766': 243880,\n",
       " u'31767': 243881,\n",
       " u'31760': 243874,\n",
       " u'31761': 243875,\n",
       " u'31762': 243876,\n",
       " u'31763': 243877,\n",
       " u'uya12': 2317788,\n",
       " u'a1.co': 632531,\n",
       " u'31768': 243882,\n",
       " u'31769': 243883,\n",
       " u'abc&9': 654569,\n",
       " u'gacr5': 1182274,\n",
       " u'frbme': 1156527,\n",
       " u'fpjl9': 1154545,\n",
       " u'23ao': 189738,\n",
       " u'23al': 189709,\n",
       " u'23am': 189719,\n",
       " u'so 19': 2132889,\n",
       " u'l2o5v': 1535798,\n",
       " u'ymppe': 2496441,\n",
       " u'rryb1': 2053938,\n",
       " u'45765': 315611,\n",
       " u'rryb8': 2053939,\n",
       " u'23ai': 189700,\n",
       " u'2mrga': 226286,\n",
       " u'ymppw': 2496442,\n",
       " u'f5sqy': 1119158,\n",
       " u'armbr': 732524,\n",
       " u'23ad': 189679,\n",
       " u's9yke': 2087134,\n",
       " u'45768': 315614,\n",
       " u'g12sh': 1171334,\n",
       " u'qz179': 1981125,\n",
       " u'(thin': 22580,\n",
       " u'23ac': 189671,\n",
       " u'hnfgu': 1282994,\n",
       " u'rrybr': 2053946,\n",
       " u'rrybu': 2053947,\n",
       " u'rrybb': 2053941,\n",
       " u'rryba': 2053940,\n",
       " u'rrybe': 2053942,\n",
       " u'0ak17': 77660,\n",
       " u'rrybi': 2053943,\n",
       " u'rrybo': 2053945,\n",
       " u'rrybl': 2053944,\n",
       " u'yctwc': 2478224,\n",
       " u'!voy': 15748,\n",
       " u'23aw': 189783,\n",
       " u'1fb83': 153374,\n",
       " u'js82i': 1454429,\n",
       " u'j9653': 1418578,\n",
       " u'q24ue': 1951999,\n",
       " u'l3112': 1536070,\n",
       " u'ct.01': 903388,\n",
       " u'iagza': 1322521,\n",
       " u'0unmz': 96816,\n",
       " u'chfsk': 873455,\n",
       " u'js824': 1454427,\n",
       " u'js825': 1454428,\n",
       " u'anshy': 716379,\n",
       " u'9ec4': 601268,\n",
       " u'malbl': 1635727,\n",
       " u'anshu': 716378,\n",
       " u'anshi': 716375,\n",
       " u'ig63': 1344562,\n",
       " u'ig60': 1344555,\n",
       " u'anshj': 716376,\n",
       " u'ig66': 1344564,\n",
       " u'ig67': 1344566,\n",
       " u'ansho': 716377,\n",
       " u'ig65': 1344563,\n",
       " u'ansha': 716372,\n",
       " u'r1492': 1985306,\n",
       " u'ig68': 1344568,\n",
       " u'ig69': 1344574,\n",
       " u'anshe': 716374,\n",
       " u'anshd': 716373,\n",
       " u'r1495': 1985308,\n",
       " u'9lyey': 607060,\n",
       " u'wsc99': 2404489,\n",
       " u'dyddu': 991859,\n",
       " u'malbe': 1635724,\n",
       " u'a\\r490': 629074,\n",
       " u'a\\r492': 629075,\n",
       " u'qvkle': 1977722,\n",
       " u'ig6a': 1344576,\n",
       " u'y2a2h': 2462928,\n",
       " u'ig6p': 1344577,\n",
       " u'ig6q': 1344579,\n",
       " u'gn6dm': 1211915,\n",
       " u'f9c1': 1123228,\n",
       " u's0104': 2074747,\n",
       " u'wsc96': 2404488,\n",
       " u's0101': 2074744,\n",
       " u'58xc': 382649,\n",
       " u'58xb': 382648,\n",
       " u'eu_': 1096225,\n",
       " u'di_s': 956322,\n",
       " u'58xk': 382653,\n",
       " u'58xi': 382651,\n",
       " u'58xm': 382655,\n",
       " u'58xs': 382660,\n",
       " u'\\rb3a': 3804,\n",
       " u'58xp': 382658,\n",
       " u'di_a': 956318,\n",
       " u'58xt': 382662,\n",
       " u'\\rb3h': 3806,\n",
       " u'58xy': 382665,\n",
       " u'58xx': 382664,\n",
       " u'\\rb3m': 3808,\n",
       " u'egry8': 1039374,\n",
       " u'iauka': 1325601,\n",
       " u'g17*!': 1171680,\n",
       " u'iauks': 1325602,\n",
       " u'bmobi': 813221,\n",
       " u'7jeep': 497010,\n",
       " u'u404s': 2251069,\n",
       " u'di_1': 956308,\n",
       " u'di_2': 956310,\n",
       " u'58x$': 382642,\n",
       " u'\\rb38': 3802,\n",
       " u'di_8': 956314,\n",
       " u'18oma': 140371,\n",
       " u'jjsyk': 1440617,\n",
       " u'a9bc': 648193,\n",
       " u'58x7': 382646,\n",
       " u'qzl9g': 1981517,\n",
       " u'ov*12': 1878992,\n",
       " u'a9bd': 648197,\n",
       " u'ey772': 1106275,\n",
       " u'a9be': 648199,\n",
       " u'mp0x1': 1676950,\n",
       " u'1p71o': 162835,\n",
       " u'egrya': 1039375,\n",
       " u'npb36': 1770060,\n",
       " u'ons32': 1849391,\n",
       " u'ons33': 1849392,\n",
       " u'ons30': 1849390,\n",
       " u'u76m': 2253649,\n",
       " u'u76j': 2253646,\n",
       " u'u76k': 2253648,\n",
       " u'ons34': 1849393,\n",
       " u'ons35': 1849394,\n",
       " u'u76f': 2253642,\n",
       " u'u76g': 2253644,\n",
       " u'ons38': 1849395,\n",
       " u'vp6ea': 2352839,\n",
       " u'nhv8': 1746663,\n",
       " u'hweps': 1302976,\n",
       " u'u76z': 2253654,\n",
       " u'u76y': 2253652,\n",
       " u'jil68': 1437581,\n",
       " u'fith1': 1144173,\n",
       " u'ntent': 1778659,\n",
       " u'ntenw': 1778660,\n",
       " u'ntenp': 1778656,\n",
       " u'706\\r)': 459525,\n",
       " u'ntens': 1778658,\n",
       " u'ntenl': 1778652,\n",
       " u'ntenm': 1778653,\n",
       " u'ntenn': 1778654,\n",
       " u'nteno': 1778655,\n",
       " u'ntenh': 1778649,\n",
       " u'nteni': 1778650,\n",
       " u'706\\r1': 459526,\n",
       " u'ntenk': 1778651,\n",
       " u'ntend': 1778646,\n",
       " u'ntene': 1778647,\n",
       " u'nteng': 1778648,\n",
       " u'ntena': 1778644,\n",
       " u'ntenc': 1778645,\n",
       " u'sbiqn': 2094850,\n",
       " u'nhvv': 1746692,\n",
       " u'o603': 1800752,\n",
       " u'nhvp': 1746686,\n",
       " u'nhvq': 1746689,\n",
       " u'nhvs': 1746690,\n",
       " u'h0651': 1236947,\n",
       " u'h0652': 1236948,\n",
       " u'a9bw': 648203,\n",
       " u'nhvz': 1746694,\n",
       " u'mgji1': 1654493,\n",
       " u'nhvf': 1746673,\n",
       " u'23a\\r': 189614,\n",
       " u'nhva': 1746667,\n",
       " u'u768': 2253638,\n",
       " u'nhvc': 1746670,\n",
       " u'u766': 2253632,\n",
       " u'u767': 2253636,\n",
       " u'u764': 2253627,\n",
       " u'u765': 2253628,\n",
       " u'nhvh': 1746675,\n",
       " u'nhvi': 1746677,\n",
       " u'u760': 2253617,\n",
       " u'u761': 2253622,\n",
       " u'fithr': 1144176,\n",
       " u'fiths': 1144177,\n",
       " u'706\\rc': 459529,\n",
       " u'nten9': 1778643,\n",
       " u'kmcco': 1507192,\n",
       " u'nten1': 1778641,\n",
       " u'nten2': 1778642,\n",
       " u'706\\rh': 459530,\n",
       " u'nhv@': 1746665,\n",
       " u'fithe': 1144174,\n",
       " u'fithm': 1144175,\n",
       " u'9ecs': 601297,\n",
       " u'waggl': 2373179,\n",
       " u'sxkk8': 2160900,\n",
       " u'jt791': 1456270,\n",
       " u'ihtyq': 1349227,\n",
       " u'sxkk3': 2160899,\n",
       " u'mkhhq': 1664351,\n",
       " u'ihtym': 1349226,\n",
       " u'ihtyj': 1349225,\n",
       " u'17lkr': 136696,\n",
       " u'44vei': 313260,\n",
       " u'44ven': 313261,\n",
       " u'44vea': 313259,\n",
       " u'76nsp': 477757,\n",
       " u'd1uud': 918947,\n",
       " u'tadkj': 2181097,\n",
       " u'tadko': 2181099,\n",
       " u'gswol': 1223874,\n",
       " u'gswoo': 1223875,\n",
       " u'9eca': 601278,\n",
       " u'gswor': 1223876,\n",
       " u'zzzor': 2570277,\n",
       " u'zcaz7': 2533117,\n",
       " u'zcaz1': 2533116,\n",
       " u'zcaz0': 2533115,\n",
       " u'3cvod': 275695,\n",
       " u'zzzok': 2570276,\n",
       " u'e@44': 1012571,\n",
       " u'e@42': 1012569,\n",
       " u'e@40': 1012567,\n",
       " u'r33y0': 1988426,\n",
       " u'to892': 2218207,\n",
       " u'a1rfa': 634829,\n",
       " u'1ep83': 152646,\n",
       " u'zcazc': 2533118,\n",
       " u'i.lan': 1309751,\n",
       " u'92jk7': 572789,\n",
       " u'r@123': 1995712,\n",
       " u'15nug': 129994,\n",
       " u'15nul': 129995,\n",
       " u'iyohp': 1407385,\n",
       " u'g4a1': 1175277,\n",
       " u'b4%i': 773610,\n",
       " u'lgoma': 1567885,\n",
       " u'5uf4i': 403009,\n",
       " u'3yhyx': 296031,\n",
       " u'wydog': 2413627,\n",
       " u'a9b0': 648163,\n",
       " u'hola.': 1285856,\n",
       " u'hola ': 1285854,\n",
       " u\"hola'\": 1285855,\n",
       " u'fi ': 1141666,\n",
       " u'oial9': 1826694,\n",
       " u'84bdg': 523733,\n",
       " u'kwsta': 1526636,\n",
       " u'hola9': 1285865,\n",
       " u't1c1a': 2170132,\n",
       " u'jgn9p': 1433659,\n",
       " u'hola3': 1285860,\n",
       " u'hola2': 1285859,\n",
       " u'hola1': 1285858,\n",
       " u'hola0': 1285857,\n",
       " u'hola6': 1285863,\n",
       " u'hola5': 1285862,\n",
       " u'hola4': 1285861,\n",
       " u'kysum': 1529608,\n",
       " u'g4ad': 1175280,\n",
       " u'kysun': 1529609,\n",
       " u'eddwa': 1026577,\n",
       " u'kysuc': 1529606,\n",
       " u'fi%': 1141670,\n",
       " u'kysue': 1529607,\n",
       " u'iyoh1': 1407384,\n",
       " u'g4ar': 1175282,\n",
       " u'g4as': 1175284,\n",
       " u'peejm': 1910252,\n",
       " u'holak': 1285874,\n",
       " u'holai': 1285873,\n",
       " u'holah': 1285872,\n",
       " u'u12n3': 2248392,\n",
       " u'holan': 1285877,\n",
       " u'holam': 1285876,\n",
       " u'holal': 1285875,\n",
       " u'holac': 1285868,\n",
       " u'holab': 1285867,\n",
       " u'holaa': 1285866,\n",
       " u'holag': 1285871,\n",
       " u'holae': 1285870,\n",
       " u'holad': 1285869,\n",
       " u'holay': 1285886,\n",
       " u'holax': 1285885,\n",
       " u'holas': 1285881,\n",
       " u'holar': 1285880,\n",
       " u'holaq': 1285879,\n",
       " u'holap': 1285878,\n",
       " u'holaw': 1285884,\n",
       " u'holav': 1285883,\n",
       " u'uwuya': 2315876,\n",
       " u'holat': 1285882,\n",
       " u'wit5': 2390664,\n",
       " u'wit7': 2390667,\n",
       " u'wit0': 2390653,\n",
       " u'wit1': 2390655,\n",
       " u'wit2': 2390660,\n",
       " u'wit8': 2390669,\n",
       " u'wit9': 2390671,\n",
       " u'xhps2': 2435039,\n",
       " u'h09': 1237070,\n",
       " u'h08': 1237018,\n",
       " u'7a767': 487979,\n",
       " u'h03': 1236749,\n",
       " u'h02': 1236693,\n",
       " u'h01': 1236619,\n",
       " u'h00': 1236526,\n",
       " u'h07': 1236962,\n",
       " u'h06': 1236914,\n",
       " u'h05': 1236862,\n",
       " ...}"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# much larger vocabulary!\n",
    "\n",
    "five_cv.vocabulary_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:53:24.195180Z",
     "start_time": "2018-12-14T03:53:24.178732Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2570934"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# same number as shape attribute above\n",
    "len(five_cv.vocabulary_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:54:37.396119Z",
     "start_time": "2018-12-14T03:53:24.202023Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x2922297 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 31080917 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# now let's count all 1, 2, 3, 4, and 5 character phrases\n",
    "five_cv_lower = CountVectorizer(ngram_range=(1, 5), analyzer='char', lowercase=False)\n",
    "\n",
    "five_char_lower = five_cv_lower.fit_transform(text)\n",
    "\n",
    "five_char_lower\n",
    "# there are 2,922,297 unique combo of up to 5-in-a-row-char phrases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:54:38.014957Z",
     "start_time": "2018-12-14T03:54:37.401700Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1, 2570934)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "matrix([[1619465, 2166552, 1530799, 1981845, 2073035,  297134,  457130,\n",
       "          406411, 1792848,  352276, 1696853,  562360,  508193,  236639,\n",
       "         1308517,  994777,   36326,  171634,  629003,  100177]])"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's most common five char \"phrases\"\n",
    "# we will accomplish this by using numpy to do some quick math\n",
    "import numpy as np\n",
    "\n",
    "# first we will sum across the rows of our data to get the total count of phrases\n",
    "summed_features = np.sum(five_char, axis=0)\n",
    "\n",
    "print summed_features.shape\n",
    "\n",
    "# we will then sort the summed_features variable and grab the 20 most common phrases' indices in the CV's vocabulary\n",
    "top_20 = np.argsort(summed_features)[:,-20:]\n",
    "\n",
    "top_20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:54:46.210658Z",
     "start_time": "2018-12-14T03:54:38.020397Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[u'm', u't', u'l', u'r', u's', u'4', u'7', u'6', u'o', u'5', u'n',\n",
       "        u'9', u'8', u'3', u'i', u'e', u'0', u'2', u'a', u'1']],\n",
       "      dtype='<U5')"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# plug these into the features of the CV.\n",
    "\n",
    "# sorting is done in ascending order so '1' is the most common phrase, followed by 'a'\n",
    "np.array(five_cv.get_feature_names())[top_20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:54:52.887968Z",
     "start_time": "2018-12-14T03:54:46.216532Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[u'13', u'98', u'ng', u'21', u'01', u'er', u'in', u'20', u'10',\n",
       "        u'x', u'11', u'v', u'23', u'00', u'19', u'z', u'an', u'j', u'w',\n",
       "        u'f', u'12', u'p', u'y', u'b', u'k', u'g', u'h', u'c', u'd',\n",
       "        u'u', u'm', u't', u'l', u'r', u's', u'4', u'7', u'6', u'o', u'5',\n",
       "        u'n', u'9', u'8', u'3', u'i', u'e', u'0', u'2', u'a', u'1']],\n",
       "      dtype='<U5')"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# top 50 phrases\n",
    "np.array(five_cv.get_feature_names())[np.argsort(summed_features)[:,-50:]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:55:00.804953Z",
     "start_time": "2018-12-14T03:54:52.893274Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[u'61', u'33', u'50', u'07', u'18', u'41', u'198', u'09', u'el',\n",
       "        u'80', u'lo', u'05', u're', u'ch', u'ia', u'03', u'90', u'89',\n",
       "        u'91', u'08', u'32', u'56', u'81', u'16', u'25', u'la', u'le',\n",
       "        u'51', u'as', u'34', u'al', u'45', u'ra', u'30', u'14', u'15',\n",
       "        u'02', u'ha', u'99', u'52', u'li', u'88', u'31', u'22', u'on',\n",
       "        u'123', u'ma', u'en', u'ar', u'q', u'13', u'98', u'ng', u'21',\n",
       "        u'01', u'er', u'in', u'20', u'10', u'x', u'11', u'v', u'23',\n",
       "        u'00', u'19', u'z', u'an', u'j', u'w', u'f', u'12', u'p', u'y',\n",
       "        u'b', u'k', u'g', u'h', u'c', u'd', u'u', u'm', u't', u'l', u'r',\n",
       "        u's', u'4', u'7', u'6', u'o', u'5', u'n', u'9', u'8', u'3', u'i',\n",
       "        u'e', u'0', u'2', u'a', u'1']], dtype='<U5')"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# top 100 phrases\n",
    "np.array(five_cv.get_feature_names())[np.argsort(summed_features)[:,-100:]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:56:52.654055Z",
     "start_time": "2018-12-14T03:55:00.809356Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x7309977 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 16293052 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "seven_cv = CountVectorizer(ngram_range=(4, 7), analyzer='char', lowercase=False)\n",
    "\n",
    "seven_char = seven_cv.fit_transform(text)\n",
    "\n",
    "seven_char\n",
    "# "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:56:52.827138Z",
     "start_time": "2018-12-14T03:56:52.659070Z"
    }
   },
   "outputs": [],
   "source": [
    "summed_features = np.sum(seven_char, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:57:24.035255Z",
     "start_time": "2018-12-14T03:56:52.831174Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[u'1011', u'star', u'56789', u'g123', u'ming', u'long', u'ang1',\n",
       "        u'2002', u'3123', u'ing1', u'201314', u'2003', u'1992', u'2004',\n",
       "        u'1122', u'ling', u'2001', u'20131', u'woai', u'lian', u'feng',\n",
       "        u'2345678', u'1212', u'1101', u'01314', u'o123', u'345678',\n",
       "        u'ever', u's123', u'uang', u'1010', u'1980', u'huan', u'i123',\n",
       "        u'king', u'mari', u'2005', u'hong', u'6789', u'1981', u'00000',\n",
       "        u'45678', u'2013', u'11111', u'1991', u'1231', u'ilove',\n",
       "        u'admin', u'ilov', u'ange', u'2006', u'0131', u'admi', u'heng',\n",
       "        u'1234567', u'5201', u'e123', u'234567', u'dmin', u'pass',\n",
       "        u'8888', u'34567', u'zhang', u'jian', u'2007', u'5678', u'1982',\n",
       "        u'2000', u'zhan', u'yang', u'n123', u'1983', u'4567', u'1984',\n",
       "        u'1990', u'a123', u'2009', u'ster', u'1985', u'iang', u'2008',\n",
       "        u'2010', u'xiao', u'chen', u'hang', u'wang', u'1986', u'1111',\n",
       "        u'1989', u'0000', u'1988', u'1987', u'1314', u'love', u'123456',\n",
       "        u'23456', u'3456', u'12345', u'2345', u'1234']], dtype='<U7')"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summed_features = np.sum(seven_char, axis=0)\n",
    "\n",
    "np.array(seven_cv.get_feature_names())[np.argsort(summed_features)[:,-100:]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:57:24.048523Z",
     "start_time": "2018-12-14T03:57:24.041431Z"
    }
   },
   "outputs": [],
   "source": [
    "# Term Frequency-Inverse Document Frequency (TF-IDF)\n",
    "\n",
    "# What: Computes \"relative frequency\" of a word that appears in a document compared to its frequency \n",
    "# across all documents\n",
    "\n",
    "# Why: More useful than \"term frequency\" for identifying \"important\" words/phrases in each document \n",
    "# (high frequency in that document, low frequency in other documents)\n",
    "\n",
    "# Notes: Used for search engine scoring, text summarization, document clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:57:24.059771Z",
     "start_time": "2018-12-14T03:57:24.054217Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:57:35.697376Z",
     "start_time": "2018-12-14T03:57:24.065733Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x70 sparse matrix of type '<type 'numpy.float64'>'\n",
       "\twith 6935190 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_tv = TfidfVectorizer(ngram_range=(1, 1), analyzer='char')\n",
    "\n",
    "# once we instantiate the module, we will call upon the fit_transform method to learn the vocabulary and then\n",
    "# transform our text series into a brand new matrix called one_char\n",
    "# Previously we created a matrix of quantitative data by applying our own functions, now we are creating numerical\n",
    "# matrices using sklearn\n",
    "one_char_tf = one_tv.fit_transform(text)\n",
    "\n",
    "# same shape as CountVectorizer\n",
    "one_char_tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:57:35.760488Z",
     "start_time": "2018-12-14T03:57:35.703791Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>!</th>\n",
       "      <th>\"</th>\n",
       "      <th>#</th>\n",
       "      <th>$</th>\n",
       "      <th>%</th>\n",
       "      <th>&amp;</th>\n",
       "      <th>'</th>\n",
       "      <th>(</th>\n",
       "      <th>...</th>\n",
       "      <th>u</th>\n",
       "      <th>v</th>\n",
       "      <th>w</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>z</th>\n",
       "      <th>{</th>\n",
       "      <th>|</th>\n",
       "      <th>}</th>\n",
       "      <th>~</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.408704</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.369502</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 70 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    \\r         !    \"    #    $    %    &    '    ( ...     u    v         w  \\\n",
       "0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 ...   0.0  0.0  0.408704   \n",
       "\n",
       "     x         y    z    {    |    }    ~  \n",
       "0  0.0  0.369502  0.0  0.0  0.0  0.0  0.0  \n",
       "\n",
       "[1 rows x 70 columns]"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# transforming a new password\n",
    "pd.DataFrame(one_tv.transform(['qwerty123']).toarray(), columns=one_tv.get_feature_names())\n",
    "\n",
    "# it is no longer counts anymore, it is a calcualtion involving relative frequency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:06.094671Z",
     "start_time": "2018-12-14T03:57:35.765456Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x2570934 sparse matrix of type '<type 'numpy.float64'>'\n",
       "\twith 31053193 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make a five-char TfidfVectorizer\n",
    "five_tv = TfidfVectorizer(ngram_range=(1, 5), analyzer='char')\n",
    "\n",
    "five_char_tf = five_tv.fit_transform(text)\n",
    "\n",
    "# same shape as CountVectorizer\n",
    "five_char_tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:13.736318Z",
     "start_time": "2018-12-14T03:59:06.100344Z"
    }
   },
   "outputs": [],
   "source": [
    "# Let's see some tfidf values of passwords\n",
    "\n",
    "# store the feature names as a numpy array\n",
    "features = np.array(five_tv.get_feature_names())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:13.766610Z",
     "start_time": "2018-12-14T03:59:13.744322Z"
    }
   },
   "outputs": [],
   "source": [
    "# transform a very simple password\n",
    "abc_transformed = five_tv.transform(['abc123'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:13.788480Z",
     "start_time": "2018-12-14T03:59:13.773120Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([u'c123', u'c12', u'c1', u'c', u'bc123', u'bc12', u'bc1', u'bc',\n",
       "       u'b', u'abc12', u'abc1', u'abc', u'ab', u'a', u'3', u'23', u'2',\n",
       "       u'123', u'12', u'1'], dtype='<U5')"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# grab the non zero features ie the ngrams that actually exist\n",
    "features[abc_transformed.nonzero()[1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:13.819164Z",
     "start_time": "2018-12-14T03:59:13.796094Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[0.28865293, 0.27817216, 0.23180301, 0.10303378, 0.33609531,\n",
       "         0.33285593, 0.31079987, 0.23023187, 0.11165455, 0.33695385,\n",
       "         0.31813905, 0.25043863, 0.18481603, 0.07089031, 0.08285116,\n",
       "         0.13324432, 0.07449711, 0.15211427, 0.12089443, 0.06747844]])"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# grab the non zero tfidf scores of the features\n",
    "abc_transformed[abc_transformed.nonzero()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:13.866446Z",
     "start_time": "2018-12-14T03:59:13.825655Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c123</th>\n",
       "      <th>c12</th>\n",
       "      <th>c1</th>\n",
       "      <th>c</th>\n",
       "      <th>bc123</th>\n",
       "      <th>bc12</th>\n",
       "      <th>bc1</th>\n",
       "      <th>bc</th>\n",
       "      <th>b</th>\n",
       "      <th>abc12</th>\n",
       "      <th>abc1</th>\n",
       "      <th>abc</th>\n",
       "      <th>ab</th>\n",
       "      <th>a</th>\n",
       "      <th>3</th>\n",
       "      <th>23</th>\n",
       "      <th>2</th>\n",
       "      <th>123</th>\n",
       "      <th>12</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.288653</td>\n",
       "      <td>0.278172</td>\n",
       "      <td>0.231803</td>\n",
       "      <td>0.103034</td>\n",
       "      <td>0.336095</td>\n",
       "      <td>0.332856</td>\n",
       "      <td>0.3108</td>\n",
       "      <td>0.230232</td>\n",
       "      <td>0.111655</td>\n",
       "      <td>0.336954</td>\n",
       "      <td>0.318139</td>\n",
       "      <td>0.250439</td>\n",
       "      <td>0.184816</td>\n",
       "      <td>0.07089</td>\n",
       "      <td>0.082851</td>\n",
       "      <td>0.133244</td>\n",
       "      <td>0.074497</td>\n",
       "      <td>0.152114</td>\n",
       "      <td>0.120894</td>\n",
       "      <td>0.067478</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       c123       c12        c1         c     bc123      bc12     bc1  \\\n",
       "0  0.288653  0.278172  0.231803  0.103034  0.336095  0.332856  0.3108   \n",
       "\n",
       "         bc         b     abc12      abc1       abc        ab        a  \\\n",
       "0  0.230232  0.111655  0.336954  0.318139  0.250439  0.184816  0.07089   \n",
       "\n",
       "          3        23         2       123        12         1  \n",
       "0  0.082851  0.133244  0.074497  0.152114  0.120894  0.067478  "
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# put them together in a DataFrame\n",
    "pd.DataFrame(abc_transformed[abc_transformed.nonzero()], \n",
    "             columns=features[abc_transformed.nonzero()[1]])\n",
    "\n",
    "# Note that \"1\" has a tfidf score of 0.067478 while \"bc123\" has a score of 0.336095, implying that\n",
    "# \"bc123\" is more \"interesting\" than \"1\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:13.912694Z",
     "start_time": "2018-12-14T03:59:13.872373Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sdf</th>\n",
       "      <th>sd</th>\n",
       "      <th>s</th>\n",
       "      <th>rf</th>\n",
       "      <th>r</th>\n",
       "      <th>f%</th>\n",
       "      <th>f</th>\n",
       "      <th>erf</th>\n",
       "      <th>er</th>\n",
       "      <th>e</th>\n",
       "      <th>df</th>\n",
       "      <th>d</th>\n",
       "      <th>%er</th>\n",
       "      <th>%e</th>\n",
       "      <th>%</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.260206</td>\n",
       "      <td>0.202144</td>\n",
       "      <td>0.081675</td>\n",
       "      <td>0.223152</td>\n",
       "      <td>0.081535</td>\n",
       "      <td>0.431418</td>\n",
       "      <td>0.22533</td>\n",
       "      <td>0.267245</td>\n",
       "      <td>0.127272</td>\n",
       "      <td>0.071568</td>\n",
       "      <td>0.217317</td>\n",
       "      <td>0.093965</td>\n",
       "      <td>0.453607</td>\n",
       "      <td>0.405458</td>\n",
       "      <td>0.2692</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        sdf        sd         s        rf         r        f%        f  \\\n",
       "0  0.260206  0.202144  0.081675  0.223152  0.081535  0.431418  0.22533   \n",
       "\n",
       "        erf        er         e        df         d       %er        %e  \\\n",
       "0  0.267245  0.127272  0.071568  0.217317  0.093965  0.453607  0.405458   \n",
       "\n",
       "        %  \n",
       "0  0.2692  "
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Do the same with a harder password\n",
    "\n",
    "# transform a better password\n",
    "password_transformed = five_tv.transform(['sdf%ERF'])\n",
    "\n",
    "# grab the non zero features\n",
    "features[password_transformed.nonzero()[1]]\n",
    "\n",
    "# grab the non zero tfidf scores of the features\n",
    "password_transformed[password_transformed.nonzero()]\n",
    "\n",
    "# put them together in a DataFrame\n",
    "pd.DataFrame(password_transformed[password_transformed.nonzero()], columns=features[password_transformed.nonzero()[1]])\n",
    "\n",
    "# Note the larger tfidf value of \"%er\" vs \"123\" (0.453607 vs 0.152114). This implies that\n",
    "# \"%er\" is more \"interesting\" and occurs less often"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:13.925268Z",
     "start_time": "2018-12-14T03:59:13.918718Z"
    }
   },
   "outputs": [],
   "source": [
    "# some other params to play with"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:29.584990Z",
     "start_time": "2018-12-14T03:59:13.931829Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x7528 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 14350326 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "two_cv = CountVectorizer(ngram_range=(1, 2), analyzer='char', lowercase=False)\n",
    "\n",
    "two_char = two_cv.fit_transform(text)\n",
    "\n",
    "two_char\n",
    "# there are 7,528 unique 1- and 2-character sequences (number of columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T03:59:45.504795Z",
     "start_time": "2018-12-14T03:59:29.589023Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x10 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 3070088 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# manipulate some other parameters to see their use\n",
    "# In CountVectorizer, max_features keeps the n tokens with the highest term frequency across the corpus\n",
    "# the TfidfVectorizer selects by that same raw term frequency, not by tfidf score\n",
    "two_cv = CountVectorizer(ngram_range=(1, 2), analyzer='char', max_features=10)\n",
    "\n",
    "two_cv.fit_transform(text)\n",
    "# there are now only 10 columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:01.177043Z",
     "start_time": "2018-12-14T03:59:45.510179Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x7500 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 8209133 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# max_df only keeps tokens that appear in at most that fraction of documents (a float is a proportion), default is 1.0\n",
    "\n",
    "# only keep tokens that appear in at most 10% of documents\n",
    "two_cv = CountVectorizer(ngram_range=(1, 2), analyzer='char', lowercase=False, max_df=.1)\n",
    "\n",
    "two_cv.fit_transform(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:17.607674Z",
     "start_time": "2018-12-14T04:00:01.183693Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x28 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 6141193 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# min_df only keeps tokens that appear in at least that fraction of documents (a float is a proportion)\n",
    "\n",
    "# only keep tokens that appear in at least 10% of documents\n",
    "two_cv = CountVectorizer(ngram_range=(1, 2), analyzer='char', lowercase=False, min_df=.1)\n",
    "\n",
    "two_cv.fit_transform(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:17.624851Z",
     "start_time": "2018-12-14T04:00:17.613429Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'0': 0,\n",
       " u'1': 1,\n",
       " u'2': 2,\n",
       " u'3': 3,\n",
       " u'4': 4,\n",
       " u'5': 5,\n",
       " u'6': 6,\n",
       " u'7': 7,\n",
       " u'8': 8,\n",
       " u'9': 9,\n",
       " u'a': 10,\n",
       " u'b': 11,\n",
       " u'c': 12,\n",
       " u'd': 13,\n",
       " u'e': 14,\n",
       " u'g': 15,\n",
       " u'h': 16,\n",
       " u'i': 17,\n",
       " u'k': 18,\n",
       " u'l': 19,\n",
       " u'm': 20,\n",
       " u'n': 21,\n",
       " u'o': 22,\n",
       " u'r': 23,\n",
       " u's': 24,\n",
       " u't': 25,\n",
       " u'u': 26,\n",
       " u'y': 27}"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "two_cv.vocabulary_  # makes sense that these appear in over 10% of documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:17.636043Z",
     "start_time": "2018-12-14T04:00:17.630259Z"
    }
   },
   "outputs": [],
   "source": [
    "# these parameters work both for Count and Tfidf Vectorizers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:17.648276Z",
     "start_time": "2018-12-14T04:00:17.641379Z"
    }
   },
   "outputs": [],
   "source": [
    "# Cosine Similarity: a quantitative measure [-1,1] of how similar two vectors are in a Vector Space\n",
    "# the closer they are to each other, the smaller the angle between them becomes\n",
    "# the smaller the angle between them is, the larger the cosine of that angle\n",
    "\n",
    "# eg. if two vectors are opposite of each other, their angle is 180, and cos(180) = -1\n",
    "# eg. if two vectors are the same, their angle is 0, and cos(0) = 1\n",
    "# eg. if two vectors are perpendicular, their angle is 90, and cos(90) = 0\n",
    "    # in the text world, we'd say that these documents are unrelated"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"https://lh4.googleusercontent.com/SodVc3Xo77b8LhEjqXymSaA-bI-kQdPeY8uG-J0wSSp5q-pxVAf_rPMUX9Y\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:17.698534Z",
     "start_time": "2018-12-14T04:00:17.653439Z"
    }
   },
   "outputs": [],
   "source": [
    "# Let's build a tool that takes in a password from a user and we will spit back a recommendation of how powerful\n",
    "# that password is. This can be done in countless ways through various approaches. We will propose one now.\n",
    "\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "# number between -1 and 1 (-1 is dissimilar, 1 is very similar (the same))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:17.716835Z",
     "start_time": "2018-12-14T04:00:17.704053Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1.]]\n",
      "[[-1.]]\n",
      "[[0.]]\n"
     ]
    }
   ],
   "source": [
    "# print(x) with a single argument prints the same thing on Python 2 and Python 3\n",
    "\n",
    "# same vectors = [1, 1], [1, 1]\n",
    "print(cosine_similarity(np.array([[1, 1]]), np.array([[1, 1]])))\n",
    "\n",
    "# opposite vectors = [1, 1], [-1, -1]\n",
    "print(cosine_similarity(np.array([[1, 1]]), np.array([[-1, -1]])))\n",
    "\n",
    "# perpendicular vectors = [1, 0], [0, 1]\n",
    "print(cosine_similarity(np.array([[1, 0]]), np.array([[0, 1]])))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:17.733397Z",
     "start_time": "2018-12-14T04:00:17.721953Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountVectorizer(analyzer='char', binary=False, decode_error=u'strict',\n",
       "        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 5), preprocessor=None, stop_words=None,\n",
       "        strip_accents=None, token_pattern=u'(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
       "        tokenizer=None, vocabulary=None)"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "five_cv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:17.786513Z",
     "start_time": "2018-12-14T04:00:17.738864Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.88873334]]\n",
      "[[0.08520286]]\n"
     ]
    }
   ],
   "source": [
    "# print(x) with a single argument prints the same thing on Python 2 and Python 3\n",
    "\n",
    "# similar phrases\n",
    "print(cosine_similarity(five_cv.transform([\"helo there\"]), five_cv.transform([\"hello there\"])))\n",
    "\n",
    "# not similar phrases\n",
    "print(cosine_similarity(five_cv.transform([\"sddgnkjfnsdlkfjnwe4r\"]), five_cv.transform([\"hello there\"])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:17.797493Z",
     "start_time": "2018-12-14T04:00:17.791579Z"
    }
   },
   "outputs": [],
   "source": [
    "# store a password that we may want to use in a variable\n",
    "attempted_password=\"qwerty123\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-17T00:14:14.391126Z",
     "start_time": "2018-12-17T00:14:11.201425Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 1048485)"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cosine_similarity(five_cv.transform([attempted_password]), five_char).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:24.322272Z",
     "start_time": "2018-12-14T04:00:17.802960Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0000000000000002"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use cosine similarity to find the closest password in our dataset to our attempted password\n",
    "# qwerty123 is a literal exact password :(\n",
    "cosine_similarity(five_cv.transform([attempted_password]), five_char).max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:28.042392Z",
     "start_time": "2018-12-14T04:00:24.327287Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8864820021501024"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lets make it harder\n",
    "attempted_password=\"qwertyqwerty123456234\"\n",
    "\n",
    "# still pretty similar to other passwords..\n",
    "cosine_similarity(five_cv.transform([attempted_password]), five_char).max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:30.928308Z",
     "start_time": "2018-12-14T04:00:28.047145Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5533028716685016"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fine lets make it even harder\n",
    "attempted_password=\"asfkKwrvn#%^&@Gfgg\"\n",
    "\n",
    "# much better!\n",
    "cosine_similarity(five_cv.transform([attempted_password]), five_char).max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:34.058926Z",
     "start_time": "2018-12-14T04:00:30.934744Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8968577221668298"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the mean score of the 20 most similar passwords\n",
    "attempted_password=\"qwerty123\"\n",
    "\n",
    "raw_vectorization = cosine_similarity(five_cv.transform([attempted_password]), five_char)\n",
    "raw_vectorization[:,np.argsort(raw_vectorization)[0,-20:]].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:37.280732Z",
     "start_time": "2018-12-14T04:00:34.064106Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.781506921049545"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the mean score of the 20 most similar passwords\n",
    "attempted_password=\"qwertyqwerty123456234\"\n",
    "\n",
    "raw_vectorization = cosine_similarity(five_cv.transform([attempted_password]), five_char)\n",
    "raw_vectorization[:,np.argsort(raw_vectorization)[0,-20:]].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:40.323833Z",
     "start_time": "2018-12-14T04:00:37.286144Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.42202078259866244"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the mean score of the 20 most similar passwords\n",
    "attempted_password=\"asfkKwrvn#%^&@Gfgg\"\n",
    "\n",
    "raw_vectorization = cosine_similarity(five_cv.transform([attempted_password]), five_char)\n",
    "raw_vectorization[:,np.argsort(raw_vectorization)[0,-20:]].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:56.678037Z",
     "start_time": "2018-12-14T04:00:40.330248Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1048485x7528 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 14350326 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remake a simple two char CV\n",
    "two_cv = CountVectorizer(ngram_range=(1, 2), analyzer='char', lowercase=False)\n",
    "\n",
    "two_char = two_cv.fit_transform(text)\n",
    "\n",
    "two_char\n",
    "# there are 7,528 unique 1- and 2-character sequences (number of columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:58.436466Z",
     "start_time": "2018-12-14T04:00:56.683809Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7891138171613815\n",
      "0.4714839309812973\n"
     ]
    }
   ],
   "source": [
    "# make a simple function using the two_char CV and matrix\n",
    "def get_closest_word_similarity(password, top_n=20):\n",
    "    \"\"\"Return the mean cosine similarity between `password` and its closest known passwords.\n",
    "\n",
    "    password: the candidate password string to score.\n",
    "    top_n: how many of the most similar rows of two_char to average over (default 20).\n",
    "    \"\"\"\n",
    "    raw_vectorization = cosine_similarity(two_cv.transform([password]), two_char)\n",
    "    # argsort is ascending, so the last top_n indices are the most similar passwords\n",
    "    closest = np.argsort(raw_vectorization)[0, -top_n:]\n",
    "    return raw_vectorization[:, closest].mean()\n",
    "\n",
    "# print(x) with a single argument prints the same thing on Python 2 and Python 3\n",
    "print(get_closest_word_similarity(\"guest123\"))  # very close to passwords in the db\n",
    "\n",
    "print(get_closest_word_similarity(\"sdfFSKSJNDFKFSD3253245sadSDF@@$@#$\"))  # not very close to passwords in the db"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:00:58.478812Z",
     "start_time": "2018-12-14T04:00:58.445085Z"
    }
   },
   "outputs": [],
   "source": [
    "# this is a complete data-driven automated password strength tester that judges passwords without any human intuition.\n",
    "\n",
    "class PasswordTester(object):\n",
    "    \"\"\"Rate a candidate password by its cosine similarity to a corpus of known passwords.\n",
    "\n",
    "    `text` is the collection of known passwords. Call make_vectorizer() once\n",
    "    before scoring so that the vectorizer and the password matrix exist.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, text):\n",
    "        self.vectorizer = None  # CountVectorizer, created by make_vectorizer()\n",
    "        self.password_matrix = None  # result of fitting the vectorizer on self.text\n",
    "        self.text = text\n",
    "\n",
    "    def make_vectorizer(self, **kwargs):\n",
    "        \"\"\"Create a CountVectorizer from **kwargs and fit it on the stored passwords.\"\"\"\n",
    "        self.vectorizer = CountVectorizer(**kwargs)\n",
    "        self.password_matrix = self.vectorizer.fit_transform(self.text)\n",
    "\n",
    "    def get_closest_word_similarity(self, password, top_n=20):\n",
    "        \"\"\"Return the mean cosine similarity between `password` and its top_n closest known passwords.\n",
    "\n",
    "        Raises AttributeError if make_vectorizer() has not been called yet,\n",
    "        and ValueError if top_n is smaller than 1.\n",
    "        \"\"\"\n",
    "        if self.vectorizer is None or self.password_matrix is None:\n",
    "            # same exception type the unfitted object used to raise, with an actionable message\n",
    "            raise AttributeError(\"call make_vectorizer() before scoring passwords\")\n",
    "        if top_n < 1:\n",
    "            # a [-0:] slice would silently average over every password\n",
    "            raise ValueError(\"top_n must be at least 1\")\n",
    "        raw_vectorization = cosine_similarity(self.vectorizer.transform([password]), self.password_matrix)\n",
    "        # argsort is ascending, so the last top_n indices are the most similar passwords\n",
    "        closest = np.argsort(raw_vectorization)[0, -top_n:]\n",
    "        return raw_vectorization[:, closest].mean()\n",
    "\n",
    "    def judge_password(self, attempted_password):\n",
    "        \"\"\"Return (label, badness_score); a higher score means closer to known passwords.\"\"\"\n",
    "        badness_score = self.get_closest_word_similarity(attempted_password)\n",
    "        # thresholds run from worst to best; the first one exceeded decides the label\n",
    "        for threshold, label in ((.9, \"very poor\"), (.8, \"poor\"), (.6, \"not bad\"), (.4, \"good\")):\n",
    "            if badness_score > threshold:\n",
    "                return label, badness_score\n",
    "        return \"very good\", badness_score\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:14.415617Z",
     "start_time": "2018-12-14T04:00:58.484218Z"
    }
   },
   "outputs": [],
   "source": [
    "p = PasswordTester(text)\n",
    "p.make_vectorizer(ngram_range=(1, 2), analyzer='char', lowercase=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:15.547401Z",
     "start_time": "2018-12-14T04:01:14.422085Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('poor', 0.8624222257655552)"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p.judge_password(\"password123321\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:16.136115Z",
     "start_time": "2018-12-14T04:01:15.553510Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('not bad', 0.7928432151071905)"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p.judge_password(\"Istanbul9999\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:16.634674Z",
     "start_time": "2018-12-14T04:01:16.140104Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('good', 0.41329460236856164)"
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# generated from LastPass, 10 characters\n",
    "p.judge_password(\"D9GLRyG0*!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:17.140925Z",
     "start_time": "2018-12-14T04:01:16.639920Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('very good', 0.3628996523892102)"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# generated from LastPass, 100 characters\n",
    "p.judge_password(\"ES%9G1UxtoBlwn^e&Bz3bAj2hMfk!2cfj8kF8yUc&J2B&khzNpBoe65Va!*XGXH1&PF5fxbKGpBsvPNQdnmnWyzb@W$tcn^%fnKa\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:17.149283Z",
     "start_time": "2018-12-14T04:01:17.144755Z"
    }
   },
   "outputs": [],
   "source": [
    "# bonus content! FeatureUnions in scikit-learn to combine handcrafted and automated features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:17.166307Z",
     "start_time": "2018-12-14T04:01:17.153686Z"
    }
   },
   "outputs": [],
   "source": [
    "def create_human_features(text_col):  # this should be a pandas Series\n",
    "    \"\"\"Return a DataFrame of hand-crafted features, one row per element of text_col.\n",
    "\n",
    "    Columns: 'special' (count_special_characters), 'length' (len), 'caps' (caps).\n",
    "    \"\"\"\n",
    "    feature_funcs = [\n",
    "        ('special', count_special_characters),\n",
    "        ('length', len),\n",
    "        ('caps', caps),\n",
    "    ]\n",
    "    human_features = pd.DataFrame()\n",
    "    # columns are added in list order, so the layout stays special, length, caps\n",
    "    for column, func in feature_funcs:\n",
    "        human_features[column] = text_col.apply(func)\n",
    "    return human_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:21.115695Z",
     "start_time": "2018-12-14T04:01:17.169190Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>special</th>\n",
       "      <th>length</th>\n",
       "      <th>caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   special  length  caps\n",
       "0        0      10     0\n",
       "1        0      12     0\n",
       "2        0       8     0\n",
       "3        0       9     0\n",
       "4        0       7     0\n",
       "5        0       8     8\n",
       "6        0       9     1\n",
       "7        0      10     0\n",
       "8        0       9     0\n",
       "9        0       8     0"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "create_human_features(text).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:21.130423Z",
     "start_time": "2018-12-14T04:01:21.121294Z"
    }
   },
   "outputs": [],
   "source": [
    "# use sklearn to turn our function into a transformer, like CountVectorizer and TfidfVectorizer\n",
    "from sklearn.preprocessing import FunctionTransformer\n",
    "\n",
    "# If validate is false, there will be no input validation. \n",
    "my_vectorizer = FunctionTransformer(create_human_features, validate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:24.686028Z",
     "start_time": "2018-12-14T04:01:21.135788Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>special</th>\n",
       "      <th>length</th>\n",
       "      <th>caps</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   special  length  caps\n",
       "0        0      10     0\n",
       "1        0      12     0\n",
       "2        0       8     0\n",
       "3        0       9     0\n",
       "4        0       7     0"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_vectorizer.fit_transform(text).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:24.700122Z",
     "start_time": "2018-12-14T04:01:24.690563Z"
    }
   },
   "outputs": [],
   "source": [
    "# a scikit-learn feature to combine multiple vectorizers\n",
    "from sklearn.pipeline import FeatureUnion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:40.677205Z",
     "start_time": "2018-12-14T04:01:24.704353Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1048485, 1042176)"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv = CountVectorizer()\n",
    "cv.fit_transform(text).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:01:44.002895Z",
     "start_time": "2018-12-14T04:01:40.683445Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1048485, 3)"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_vectorizer = FunctionTransformer(create_human_features, validate=False)\n",
    "my_vectorizer.fit_transform(text).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:02:02.083944Z",
     "start_time": "2018-12-14T04:01:44.008604Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1048485, 1042179)"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fu = FeatureUnion([(\"count\", cv), \n",
    "                   (\"my_vectorizer\", my_vectorizer)])\n",
    "fu.fit_transform(text).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-14T04:02:02.101669Z",
     "start_time": "2018-12-14T04:02:02.090274Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1042179"
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "1042176 + 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
